Import Relevant Libraries

In [2]:
############################## General ##############################
import os
import itertools
import math
import ast


import time
from tqdm import tqdm


############################## Data Handling ##############################
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from collections import defaultdict, Counter

from scipy import stats

import statsmodels.api as sm
from statsmodels.formula.api import ols

############################## SKLearn ##############################
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score, accuracy_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import linear_model
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE

############################## Data Visualization ##############################
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
sns.set(rc={'figure.figsize':(15,8)})

from prettytable import PrettyTable 
import eli5
from eli5 import show_weights, show_prediction
from eli5.sklearn import PermutationImportance

from yellowbrick.regressor import ResidualsPlot

############################## Network Analysis ##############################
import networkx as nx
from pyvis.network import Network
from nxviz import CircosPlot
from nxviz import ArcPlot
from community import community_louvain

import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show

############################## Machine Learning ##############################
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.utils import np_utils

############################## NLP ##############################
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.models import Phrases, LdaModel, CoherenceModel
from gensim.corpora.dictionary import Dictionary
import pyLDAvis.gensim

from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

############################## Oversampling ##############################
from imblearn.over_sampling import RandomOverSampler


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

Define Static Variables

In [3]:
DATA_DIRECTORY = os.getcwd() + "/../Data"

PATH_LYRICS = DATA_DIRECTORY + "/df_lyrics.csv"
PATH_LYRICS_CLEAN = DATA_DIRECTORY + "/df_lyrics_clean.csv"

PATH_ARTISTS = DATA_DIRECTORY + "/artists_with_metadata.csv"
PATH_ARTISTS_CLEAN = DATA_DIRECTORY + "/artists_with_metadata_clean.csv"

PATH_ALBUMS = DATA_DIRECTORY + "/albums_with_metadata.csv"

PATH_TRACKS = DATA_DIRECTORY + "/df_tracks.csv"
PATH_TRACKS_CLEAN = DATA_DIRECTORY + "/df_tracks_clean.csv"

PATH_NODES = DATA_DIRECTORY + "/df_nodes.csv"
PATH_EDGES = DATA_DIRECTORY + "/df_edgelist.csv"

RANDOM = 42

Load relevant data

In [4]:
# Load tracks data
df_tracks = pd.read_csv(PATH_TRACKS, index_col=0, 
                        converters = {"Artist Names": ast.literal_eval,
                                      "Genres": ast.literal_eval})

# Load artists data and drop columns that were not fetched through Spotify
df_artists = pd.read_csv(PATH_ARTISTS, index_col=0, converters = {"Genres": ast.literal_eval})
df_artists = df_artists.drop(['Streams', 'Tracks', '1b+', '100m+', '10m+', '1m+','Last Update', 'href'], axis = 1)

# Load lyrics data
df_lyrics = df_lyrics = pd.read_csv(PATH_LYRICS, index_col=0)
df_lyrics = df_lyrics.drop(['Title_with_featured'], axis = 1)

Section 1: Data Description

Artists Data

In [5]:
df_artists.head()
Out[5]:
Artist Name Follower_Count Genres Artist_ID Popularity
0 Drake 50509972 [canadian hip hop, canadian pop, hip hop, pop ... 3TVXtAsR1Inumwj472S9r4 100
1 Ed Sheeran 71571877 [pop, uk pop] 6eUKZXaKkcviH0Ku9w2n3V 93
2 Post Malone 29076628 [dfw rap, melodic rap, rap] 246dkjvS1zLTtiykXe5h60 95
3 Eminem 38966309 [detroit hip hop, hip hop, rap] 7dGJo4pcD2V6oG8kP0tJRR 94
4 Ariana Grande 52367445 [dance pop, pop, post-teen pop] 66CXWjxzNUsdJxJ2JdwvnR 98
In [6]:
df_artists.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Artist Name     1000 non-null   object
 1   Follower_Count  1000 non-null   int64 
 2   Genres          1000 non-null   object
 3   Artist_ID       1000 non-null   object
 4   Popularity      1000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 46.9+ KB
In [7]:
df_artists.describe()
Out[7]:
Follower_Count Popularity
count 1.000000e+03 1000.000000
mean 4.328741e+06 78.815000
std 6.043165e+06 6.371764
min 1.207500e+04 48.000000
25% 1.386252e+06 74.000000
50% 2.499266e+06 78.000000
75% 4.600943e+06 83.000000
max 7.157188e+07 100.000000
In [8]:
# Count frequency of each genre
genre_count = defaultdict(int)

for genres in df_artists["Genres"]:
    for genre in genres:
        genre_count[genre] += 1
In [9]:
print(f"Number of genres appearing in dataset: {len(genre_count.keys())}")
Number of genres appearing in dataset: 614
In [10]:
# Sort genre dictionary by value
genre_count_sorted = sorted(genre_count.items(), key=lambda item: item[1], reverse=True)
In [11]:
# Print top 10 most frequent genres
genre_count_sorted[:10]
Out[11]:
[('pop', 278),
 ('dance pop', 210),
 ('pop dance', 156),
 ('pop rap', 147),
 ('rap', 124),
 ('rock', 122),
 ('post-teen pop', 101),
 ('latin', 93),
 ('hip hop', 91),
 ('modern rock', 77)]
In [12]:
# Print bottom 10 least occuring genres
genre_count_sorted[-10:]
Out[12]:
[('early music', 1),
 ('german baroque', 1),
 ('deep underground hip hop', 1),
 ('argentine reggae', 1),
 ('bronx hip hop', 1),
 ('aussietronica', 1),
 ('swedish indie rock', 1),
 ('ambient pop', 1),
 ('el paso indie', 1),
 ('shoegaze', 1)]
In [13]:
# Determine number of genres associated to each artist
genres_per_artist = [len(genres) for genres in df_artists["Genres"]]
In [14]:
min(genres_per_artist)
Out[14]:
0
In [15]:
# Find all artists that don't have any genres
[i for i, x in enumerate(genres_per_artist) if x == 0]
Out[15]:
[528]
In [16]:
df_artists.iloc[528,:]["Artist Name"]
Out[16]:
'R. Kelly'
In [17]:
max(genres_per_artist)
Out[17]:
12

Tracks Data

In [18]:
df_tracks.head()
Out[18]:
Track Name Artist Names Album Name Duration in ms Explicit Popularity Track ID Artist IDs Album ID Href URI Acousticness Danceability Energy Instrumentalness Liveness Loudness Speechiness Valence Tempo Time Signature Key Mode Release Date Genres
0 Deep Pockets [Drake] Dark Lane Demo Tapes 222928 True 63 3IvMYBE7A3c7to1aEcfFJk ['3TVXtAsR1Inumwj472S9r4'] 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/3IvMYBE7A3c7... spotify:track:3IvMYBE7A3c7to1aEcfFJk 0.482 0.473 0.824 0.000000 0.6050 -3.680 0.163 0.3740 77.888 4.0 7.0 0.0 2020-05-01 [canadian hip hop, canadian pop, hip hop, pop ...
1 When To Say When [Drake] Dark Lane Demo Tapes 223124 True 65 5TCBWmEBrin7etRa4Lswr1 ['3TVXtAsR1Inumwj472S9r4'] 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/5TCBWmEBrin7... spotify:track:5TCBWmEBrin7etRa4Lswr1 0.252 0.410 0.820 0.000000 0.5380 -6.808 0.533 0.5260 170.718 4.0 1.0 1.0 2020-05-01 [canadian hip hop, canadian pop, hip hop, pop ...
2 Chicago Freestyle (feat. Giveon) [Drake, Giveon] Dark Lane Demo Tapes 220487 True 84 4wVOKKEHUJxHCFFNUWDn0B ['3TVXtAsR1Inumwj472S9r4', '4fxd5Ee7UefO4CUXgw... 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/4wVOKKEHUJxH... spotify:track:4wVOKKEHUJxHCFFNUWDn0B 0.629 0.735 0.449 0.000000 0.1130 -7.507 0.347 0.0397 122.947 4.0 10.0 1.0 2020-05-01 [canadian hip hop, canadian pop, hip hop, pop ...
3 Not You Too (feat. Chris Brown) [Drake, Chris Brown] Dark Lane Demo Tapes 269680 True 68 3Q4gttWQ6hxqWOa3tHoTNi ['3TVXtAsR1Inumwj472S9r4', '7bXgB6jMjp9ATFy66e... 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/3Q4gttWQ6hxq... spotify:track:3Q4gttWQ6hxqWOa3tHoTNi 0.342 0.458 0.452 0.000019 0.0703 -9.299 0.047 0.3160 86.318 4.0 9.0 0.0 2020-05-01 [canadian hip hop, canadian pop, hip hop, pop ...
4 Toosie Slide [Drake] Dark Lane Demo Tapes 247058 True 80 466cKvZn1j45IpxDdYZqdA ['3TVXtAsR1Inumwj472S9r4'] 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/466cKvZn1j45... spotify:track:466cKvZn1j45IpxDdYZqdA 0.289 0.830 0.490 0.000003 0.1130 -8.820 0.209 0.8450 81.604 4.0 1.0 0.0 2020-05-01 [canadian hip hop, canadian pop, hip hop, pop ...
In [19]:
df_tracks.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 445578 entries, 0 to 445577
Data columns (total 25 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Track Name        445578 non-null  object 
 1   Artist Names      445578 non-null  object 
 2   Album Name        445578 non-null  object 
 3   Duration in ms    445578 non-null  int64  
 4   Explicit          445578 non-null  bool   
 5   Popularity        445578 non-null  int64  
 6   Track ID          445578 non-null  object 
 7   Artist IDs        445578 non-null  object 
 8   Album ID          445578 non-null  object 
 9   Href              445578 non-null  object 
 10  URI               445578 non-null  object 
 11  Acousticness      445522 non-null  float64
 12  Danceability      445522 non-null  float64
 13  Energy            445522 non-null  float64
 14  Instrumentalness  445522 non-null  float64
 15  Liveness          445522 non-null  float64
 16  Loudness          445522 non-null  float64
 17  Speechiness       445522 non-null  float64
 18  Valence           445522 non-null  float64
 19  Tempo             445522 non-null  float64
 20  Time Signature    445522 non-null  float64
 21  Key               445522 non-null  float64
 22  Mode              445522 non-null  float64
 23  Release Date      445578 non-null  object 
 24  Genres            445578 non-null  object 
dtypes: bool(1), float64(12), int64(2), object(10)
memory usage: 85.4+ MB

The tracks dataframe contains 445578 records in total. All but 56 of those tracks are fully populated. Those 56 tracks are missing the audio features.

In [20]:
df_tracks.describe()
Out[20]:
Duration in ms Popularity Acousticness Danceability Energy Instrumentalness Liveness Loudness Speechiness Valence Tempo Time Signature Key Mode
count 4.455780e+05 445578.000000 445522.000000 445522.000000 445522.000000 445522.000000 445522.000000 445522.000000 445522.000000 445522.000000 445522.000000 445522.000000 445522.000000 445522.000000
mean 2.256724e+05 17.070030 0.516985 0.492722 0.461233 0.280951 0.225263 -13.062790 0.091137 0.441015 115.106474 3.814826 5.217013 0.674882
std 1.326496e+05 17.842596 0.403870 0.188886 0.307554 0.391698 0.216674 8.216911 0.128682 0.260593 31.321431 0.593445 3.520897 0.468419
min 1.280000e+03 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -60.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.599332e+05 2.000000 0.073200 0.349000 0.162000 0.000000 0.098700 -19.353000 0.036800 0.220000 90.511000 4.000000 2.000000 0.000000
50% 2.112400e+05 11.000000 0.537000 0.491000 0.465000 0.001250 0.130000 -10.315000 0.045700 0.418000 114.400500 4.000000 5.000000 1.000000
75% 2.639330e+05 29.000000 0.963000 0.636000 0.738000 0.793000 0.267000 -6.211000 0.075400 0.644000 135.534000 4.000000 8.000000 1.000000
max 4.796395e+06 96.000000 0.996000 0.989000 1.000000 1.000000 1.000000 0.932000 0.971000 1.000000 248.060000 5.000000 11.000000 1.000000
In [21]:
df_tracks["Track Name"].value_counts()
Out[21]:
Intro                                                                                                          472
Outro                                                                                                          155
Aria mit 30 Veränderungen, BWV 988 "Goldberg Variations": Aria                                                 155
Violin Concerto No.2 In E, BWV 1042: 3. Allegro assai                                                          138
Partita No.1 in B flat, BWV 825: 1. Praeludium                                                                 116
                                                                                                              ... 
Missa brevis in D Minor, K. 65/61a: IV. Sanctus                                                                  1
205 - Teuflische Kaffeefahrt - Teil 47                                                                           1
Weihnachtsoratorium, BWV 248 - Teil 6: II. Evangelist. "Da berief Herodes die Weisen heimlich" - Remastered      1
This Is What It Feels Like - Maddix Extended Remix                                                               1
Yo Soy del Barrio (feat. Tego Calderón)                                                                          1
Name: Track Name, Length: 211905, dtype: int64
In [22]:
# Calculate number of different track names in dataset
len(df_tracks["Track Name"].value_counts().keys())
Out[22]:
211905
In [23]:
# Calculate number of different track names that appear more than once in dataset
sum(df_tracks["Track Name"].value_counts() > 1)
Out[23]:
69131
In [24]:
# Calculate number of different track names that appear more than 50 times
sum(df_tracks["Track Name"].value_counts() > 50)
Out[24]:
181

There are several potential duplicate songs in this dataset that have to be dealt with later.

Top Performers
In [25]:
# Most popular songs
df_tracks.sort_values(by="Popularity",ascending=False)[["Track Name","Artist Names","Popularity"]].head(10)
Out[25]:
Track Name Artist Names Popularity
2667 Blinding Lights [The Weeknd] 96
87014 What You Know Bout Love [Pop Smoke] 96
87002 For The Night (feat. Lil Baby & DaBaby) [Pop Smoke, Lil Baby, DaBaby] 95
44818 ROCKSTAR (feat. Roddy Ricch) [DaBaby, Roddy Ricch] 94
35660 Watermelon Sugar [Harry Styles] 94
266686 Heather [Conan Gray] 93
1924 positions [Ariana Grande] 92
46192 Someone You Loved [Lewis Capaldi] 91
3693 Una Locura [Ozuna, J Balvin, Chencho Corleone] 91
87012 Mood Swings (feat. Lil Tjay) [Pop Smoke, Lil Tjay] 91
In [26]:
# Longest Songs
df_tracks.sort_values(by="Duration in ms",ascending=False)[["Track Name","Artist Names","Duration in ms"]].head()
Out[26]:
Track Name Artist Names Duration in ms
200854 Until Now [Swedish House Mafia] 4796395
43840 Continuous Mix [Tiësto] 4777826
324182 Don Giovanni, K. 527: Act I (Live) [Wolfgang Amadeus Mozart, Mark Walters, Christ... 4750718
42865 Continuous Mix Nyana, mix 2 [Tiësto] 4737986
43571 Continuous Mix 1 [Tiësto] 4684626
In [27]:
# Most Acoustic Songs
df_tracks.sort_values(by="Acousticness",ascending=False)[["Track Name","Artist Names","Acousticness"]].head()
Out[27]:
Track Name Artist Names Acousticness
233389 24 Préludes, Op.28: 6. In B Minor [Frédéric Chopin, Friedrich Gulda] 0.996
234237 Preludes, Op. 28: No. 1 in C Major. Agitato [Frédéric Chopin, Bianca Sitzius] 0.996
334385 Così fan tutte, K. 588: Act II Scene 4: Bella ... [Wolfgang Amadeus Mozart, Elisabeth Schwarzkop... 0.996
334386 Così fan tutte, K. 588: Act II Scene 4: Sani e... [Wolfgang Amadeus Mozart, Elisabeth Schwarzkop... 0.996
334387 Così fan tutte, K. 588: Act II Scene 4: Fortun... [Wolfgang Amadeus Mozart, Elisabeth Schwarzkop... 0.996
In [28]:
# Songs to dance to
df_tracks.sort_values(by="Danceability",ascending=False)[["Track Name","Artist Names","Danceability"]].head()
Out[28]:
Track Name Artist Names Danceability
242603 Raise Your Weapon - Weiss Remix [deadmau5, WEISS] 0.989
268417 Faded (Tâches Remix) [ZHU] 0.987
377274 Plus Something Else - Remastered Version [Eurythmics, Annie Lennox, Dave Stewart] 0.987
377258 Plus Something Else - Remastered Version [Eurythmics, Annie Lennox, Dave Stewart] 0.987
268438 Faded - Taches Remix [ZHU] 0.987
In [29]:
# High Energy Songs
df_tracks.sort_values(by="Energy",ascending=False)[["Track Name","Artist Names","Energy"]].head()
Out[29]:
Track Name Artist Names Energy
99495 I Feel Love - Omar Sherif Remix [Mix Cut] [CRØW, Omar Sherif] 1.0
382034 Zapatos Viejos - Remix [Gloria Trevi] 1.0
382033 Dr. Psiquiatra - Remix [Gloria Trevi] 1.0
342045 Piano Concerto No. 23 in A Major, K. 488: Appl... [Wolfgang Amadeus Mozart, Robert Casadesus, Fr... 1.0
342049 Piano Concerto No. 24 in C Minor, K. 491: Appl... [Wolfgang Amadeus Mozart, Robert Casadesus, Fr... 1.0
In [30]:
# Songs that put you in a good mood
df_tracks.sort_values(by="Valence",ascending=False)[["Track Name","Artist Names","Valence"]].head()
Out[30]:
Track Name Artist Names Valence
118818 La Suata [La Arrolladora Banda El Limón De Rene Camacho] 1.0
152965 Poema Mudo [Joan Sebastian] 1.0
119219 La Suata - Vers.Radio [La Arrolladora Banda El Limón De Rene Camacho] 1.0
119049 La Suata [La Arrolladora Banda El Limón De Rene Camacho] 1.0
119028 La Suata [La Arrolladora Banda El Limón De Rene Camacho] 1.0
In [31]:
# Songs that put you in a bad mood
df_tracks.sort_values(by="Valence",ascending=True)[["Track Name","Artist Names","Valence"]].head()
Out[31]:
Track Name Artist Names Valence
413537 Christmas Oratorio, BWV 248, Cantata 5: Evange... [Johann Sebastian Bach, Howard Crook, Philippe... 0.0
197660 Miracle Cure - Live At Hull Version [The Who, Andy Macpherson, Jon Astley] 0.0
197628 Miracle Cure - 40th Anniversary Version - Live... [The Who, Andy Macpherson, Jon Astley] 0.0
80499 My Perception [YG, Slim 400] 0.0
71409 THE PAIN - Interlude [Tove Lo] 0.0
In [32]:
# Very fast songs
df_tracks.sort_values(by="Tempo",ascending=False)[["Track Name","Artist Names","Tempo"]].head()
Out[32]:
Track Name Artist Names Tempo
242578 Sofi Needs A Ladder - Pig&Dan Remix [deadmau5, Pig&Dan] 248.060
242599 Sofi Needs A Ladder - Pig&Dan Remix [deadmau5, Pig&Dan] 247.801
274019 I Don't Want You on My Mind [Bill Withers] 243.507
55452 Talk With You (Demo) [Remastered] [Fleetwood Mac] 241.005
276673 It's So Hard - Take 11 / Raw Studio Mix [John Lennon] 240.699
In [33]:
# Very slow songs
df_tracks.sort_values(by="Tempo",ascending=True)[["Track Name","Artist Names","Tempo"]].head()
Out[33]:
Track Name Artist Names Tempo
411692 St. Matthew Passion, BWV 244, Pt. 2: No. 45b, ... [Johann Sebastian Bach, Suddeutscher Madrigalc... 0.0
350189 The One - Instrumental, F9 Megamix [Kylie Minogue] 0.0
69920 My Dad - Interlude [The Notorious B.I.G.] 0.0
69914 The Greatest Rapper (Interlude) [The Notorious B.I.G.] 0.0
69897 My Dad - Interlude [The Notorious B.I.G.] 0.0
In [34]:
df_tracks.columns
Out[34]:
Index(['Track Name', 'Artist Names', 'Album Name', 'Duration in ms',
       'Explicit', 'Popularity', 'Track ID', 'Artist IDs', 'Album ID', 'Href',
       'URI', 'Acousticness', 'Danceability', 'Energy', 'Instrumentalness',
       'Liveness', 'Loudness', 'Speechiness', 'Valence', 'Tempo',
       'Time Signature', 'Key', 'Mode', 'Release Date', 'Genres'],
      dtype='object')

Lyrics Data

In [35]:
df_lyrics.head()
Out[35]:
Artist Title Lyrics
0 Drake God’s Plan [Intro]\nAnd they wishin' and wishin' and wish...
1 Drake In My Feelings [Intro: Drake]\nTrap, TrapMoneyBenny\nThis shi...
2 Drake Hotline Bling [Intro]\nYou used to call me on my\nYou used t...
3 Drake One Dance [Intro: Kyla]\nBaby, I like your style\n\n[Ver...
4 Drake Hold On, We’re Going Home [Produced by Nineteen85, Majid Jordan & Noah "...
In [36]:
df_lyrics.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24260 entries, 0 to 24259
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Artist  24260 non-null  object
 1   Title   24260 non-null  object
 2   Lyrics  24128 non-null  object
dtypes: object(3)
memory usage: 758.1+ KB

Section 2: Data Preparation

Determination of Artists' Genres

In [37]:
df_artists.explode("Genres")["Genres"].value_counts()
Out[37]:
pop              278
dance pop        210
pop dance        156
pop rap          147
rap              124
                ... 
pop violin         1
classical era      1
boston rock        1
operatic pop       1
indian edm         1
Name: Genres, Length: 614, dtype: int64

Assign each artist the first genre it matches.

In [38]:
def pick_single_genres(df_art):
    """
    Picks the first matched genre of the genre frequency list and assigns the first match to the artist
    """
    # Generates a new columns holding NaN values
    df_art["Genre"] = np.nan
    # List looped through holds items order by occurence in the list of artists in a descending order
    for genre in list(pd.DataFrame(df_art.explode("Genres")["Genres"].value_counts()).index):
        # takes all the rows in the df which do not have a value for the Genre column assigned yet
        for index, elem_empty, elem in zip(df_art[df_art["Genre"].isnull()].index, 
                                                    df_art[df_art["Genre"].isnull()]["Genre"], 
                                                    df_art[df_art["Genre"].isnull()]["Genres"]):
            # loops through the list of genres assigned to an artist and if matches with genre loop position 
            # it will assign the most popular genre the artist matches (since looped based on appearence and
            # therefore matched element of row is sorted out since it does not hold a NaN value anymore)
            for list_elem in elem:
                if list_elem == genre:
                    df_art.loc[index, "Genre"] = list_elem

Apply function.

In [39]:
pick_single_genres(df_artists)

Check how many difference genres the dataset still holds and their distribution.

In [40]:
df_artists["Genre"].nunique()
Out[40]:
113
In [41]:
pd.DataFrame(df_artists["Genre"].value_counts()).head(20)
Out[41]:
Genre
pop 278
rock 114
latin 83
pop rap 77
rap 42
dance pop 40
contemporary country 34
pop dance 20
alternative metal 19
modern rock 17
german hip hop 16
regional mexican 14
adult standards 11
sertanejo universitario 11
k-pop 10
trap latino 9
pop rock 9
soul 8
french hip hop 8
latin pop 6

Consolidate genres that are likely to belong together based on their names.

In [42]:
df_artists["Genre"] = df_artists["Genre"].replace({"dance pop": "pop dance",
                                                   "modern rock": "rock",
                                                   "contemporary country": "country",
                                                   "k-pop": "pop",
                                                   "trap latino": "latin",
                                                   "pop urbaine": "pop",
                                                   "rock en espanol": "rock",
                                                   "pop nacional": "pop",
                                                   "melodic rap": "rap",
                                                   "electropop": "pop",
                                                   "indie pop": "pop",
                                                   "australian pop": "pop",
                                                   "emo rap": "rap",
                                                   "swedish pop": "pop",
                                                   "europop": "pop",
                                                   "art pop": "pop",
                                                   "dutch pop": "pop",
                                                   "bow pop": "pop",
                                                   "post-teen pop": "pop",
                                                   "soft rock": "rock",
                                                    })

Create the list with genres sorted in descending order of occurrence.

In [43]:
genre_list = pd.DataFrame(df_artists["Genre"].value_counts())

Filter genre as described above.

In [44]:
genre_list_top_vals = genre_list[genre_list["Genre"] >= 15]
In [45]:
df_artists["Genre"] = df_artists["Genre"].apply(lambda x: x if x in genre_list_top_vals.index else None)
In [46]:
df_artists["Rank"] = df_artists.index
In [47]:
df_artists.to_csv(PATH_ARTISTS_CLEAN)

Joining Selected Artist Genre into Tracks and Lyrics Dataframe

In [48]:
df_tracks["Artist Names"][0]
Out[48]:
['Drake']
In [49]:
df_lyrics_with_genre = pd.merge(left=df_lyrics,
                                right=df_artists[["Artist Name","Genre"]],
                                left_on="Artist",
                                right_on="Artist Name").drop("Artist Name",axis=1)

df_lyrics = df_lyrics_with_genre.copy()
In [50]:
# This block is used to identify the "main" artist of each track based on whose album the track is featured on.
# This artist's genre is then assigned to the song.
df_albums = pd.read_csv(PATH_ALBUMS,index_col=0)
df_tracks_helper = pd.merge(left=df_tracks,
                            right=df_albums[["Album ID","Artist ID"]],
                            left_on="Album ID",
                            right_on="Album ID")
In [51]:
df_tracks_with_genre = pd.merge(left=df_tracks_helper,
                                right=df_artists[["Artist_ID","Genre"]],
                                left_on="Artist ID",
                                right_on="Artist_ID")

df_tracks = df_tracks_with_genre.copy()

Treatment of Duplicate Values

In [52]:
df_tracks["Track Name"].value_counts()
Out[52]:
Intro                                                                                                          472
Outro                                                                                                          155
Aria mit 30 Veränderungen, BWV 988 "Goldberg Variations": Aria                                                 155
Violin Concerto No.2 In E, BWV 1042: 3. Allegro assai                                                          138
Partita No.1 in B flat, BWV 825: 1. Praeludium                                                                 116
                                                                                                              ... 
Missa brevis in D Minor, K. 65/61a: IV. Sanctus                                                                  1
205 - Teuflische Kaffeefahrt - Teil 47                                                                           1
Weihnachtsoratorium, BWV 248 - Teil 6: II. Evangelist. "Da berief Herodes die Weisen heimlich" - Remastered      1
This Is What It Feels Like - Maddix Extended Remix                                                               1
Yo Soy del Barrio (feat. Tego Calderón)                                                                          1
Name: Track Name, Length: 211905, dtype: int64

Several titles appear more than once. However, a title appearing more than once is not necessarily a duplicate, since two different artists can have songs that go by the same titles. Let's visualise the frequency of titles going by the same name occuring.

In [53]:
def draw_duplicate_graph(count_list,bins=[0,1,3,10,100000],labels=["1","2-3","4-10",">10"]):
    
    out = pd.cut(count_list, bins=bins, labels=labels)
    ax = out.value_counts(sort=False).plot.bar(rot=0, color="b", figsize=(6,4))
    plt.show()
draw_duplicate_graph(df_tracks["Track Name"].value_counts())

It can be seen that for the majority of titles, duplicates are no problem at all. Yet, there is still a considerable amount of titles that appear several times for the same artist.

In [54]:
df_tracks[df_tracks["Track Name"] == 'Violin Concerto No.2 In E, BWV 1042: 3. Allegro assai']["Popularity"]
Out[54]:
383027    2
383052    1
383208    0
383525    5
383550    4
         ..
409947    0
410423    1
410761    5
410777    5
410892    4
Name: Popularity, Length: 138, dtype: int64

When looking at the popularity of each instance of 'Violin Concerto No.2 In E, BWV 1042: 3. Allegro assai', which is among the most frequently occuring titles, is becomes appearant that lots of its versions are not listented to very often. For that reason, the first strategy of removing duplicates is by removing all songs from the dataset that have a popularity less then five. This operation also affect non-duplicate songs that are very unpopular which is an intended side-effect to reduce the dataset size and keep it to the most relevant tracks.

In [55]:
df_tracks_pop = df_tracks[df_tracks["Popularity"] > 5]
In [56]:
print(f'Nr. of duplicate titles before: {sum(df_tracks["Track Name"].value_counts() > 1)}')
print(f'Nr. of duplicate titles after: {sum(df_tracks_pop["Track Name"].value_counts() > 1)}')
Nr. of duplicate titles before: 69131
Nr. of duplicate titles after: 48611
In [57]:
draw_duplicate_graph(df_tracks_pop["Track Name"].value_counts())

This step reduced the number of duplicate titles by more than 20,000.
Let's now have a closer look on "Intro", which is the title occuring most frequently.

In [58]:
df_tracks_pop[df_tracks_pop["Track Name"] == "Intro"].head()
Out[58]:
Track Name Artist Names Album Name Duration in ms Explicit Popularity Track ID Artist IDs Album ID Href URI Acousticness Danceability Energy Instrumentalness Liveness Loudness Speechiness Valence Tempo Time Signature Key Mode Release Date Genres Artist ID Artist_ID Genre
2027 Intro [Ariana Grande] Christmas & Chill (Japan Version) 65960 False 27 4QgH3GXHnHuxMJu3RG69Hg ['66CXWjxzNUsdJxJ2JdwvnR'] 2A1KyqHu1DmLtjXpIMNoQq https://api.spotify.com/v1/tracks/4QgH3GXHnHux... spotify:track:4QgH3GXHnHuxMJu3RG69Hg 0.000444 0.713 0.326 0.000000 0.115 -11.776 0.0462 0.3060 119.039 4.0 2.0 0.0 2016-11-18 [dance pop, pop, post-teen pop] 66CXWjxzNUsdJxJ2JdwvnR 66CXWjxzNUsdJxJ2JdwvnR pop
2132 Intro [Ariana Grande] My Everything (Italian Edition) 79840 False 14 0LlAdQc4Vd7wqqbdtKJmQR ['66CXWjxzNUsdJxJ2JdwvnR'] 2LFT0OnfZzsvjcU7bNMM7W https://api.spotify.com/v1/tracks/0LlAdQc4Vd7w... spotify:track:0LlAdQc4Vd7wqqbdtKJmQR 0.969000 0.416 0.182 0.000035 0.339 -12.439 0.0337 0.0977 90.351 4.0 6.0 1.0 2015-06-30 [dance pop, pop, post-teen pop] 66CXWjxzNUsdJxJ2JdwvnR 66CXWjxzNUsdJxJ2JdwvnR pop
2165 Intro [Ariana Grande] My Everything (Deluxe) 79840 False 25 1DfZFyeOTqcrveRLJD2psY ['66CXWjxzNUsdJxJ2JdwvnR'] 6dYDqMHA4COCFC0TfCiUCj https://api.spotify.com/v1/tracks/1DfZFyeOTqcr... spotify:track:1DfZFyeOTqcrveRLJD2psY 0.969000 0.259 0.182 0.000035 0.339 -12.439 0.0388 0.0963 171.476 4.0 6.0 1.0 2015-04-13 [dance pop, pop, post-teen pop] 66CXWjxzNUsdJxJ2JdwvnR 66CXWjxzNUsdJxJ2JdwvnR pop
2183 Intro [Ariana Grande] My Everything 79786 False 23 5h4Nb77ApXkgLhhJcVBMmC ['66CXWjxzNUsdJxJ2JdwvnR'] 3AJFL1V5nHtDvH50DJtxZ3 https://api.spotify.com/v1/tracks/5h4Nb77ApXkg... spotify:track:5h4Nb77ApXkgLhhJcVBMmC 0.969000 0.407 0.183 0.000038 0.331 -12.439 0.0333 0.0955 89.830 4.0 6.0 1.0 2014-08-25 [dance pop, pop, post-teen pop] 66CXWjxzNUsdJxJ2JdwvnR 66CXWjxzNUsdJxJ2JdwvnR pop
2201 Intro [Ariana Grande] My Everything (Deluxe) 79840 False 54 0qMNPhpRzbghJy6G3SgRag ['66CXWjxzNUsdJxJ2JdwvnR'] 6EVYTRG1drKdO8OnIQBeEj https://api.spotify.com/v1/tracks/0qMNPhpRzbgh... spotify:track:0qMNPhpRzbghJy6G3SgRag 0.969000 0.447 0.182 0.000038 0.327 -12.439 0.0324 0.0960 89.814 4.0 6.0 1.0 2014-08-25 [dance pop, pop, post-teen pop] 66CXWjxzNUsdJxJ2JdwvnR 66CXWjxzNUsdJxJ2JdwvnR pop

Some of those tracks are obviously duplicates in the sense that they are the very same song appearing multiple times in the dataset. For example, looking at indices 2165 and 2201, the tracks have the same title, artist, album, and even duration. They are also very similar (albeit not completely the same) in term of their audio features. Similar observations can be made for many other songs in the dataset as well. In all those cases, keeping both/all songs in the dataset would not make a lot of sense. To remove those duplicates, the following procedure is applied:

  1. Group the dataset by artist name and track name
  2. Sort each artist/track combination by their popularity, so that the most popular track is listed first
  3. Only keep the most popular track of each artist/track combination and delete the rest

Below is an example on how dropping the duplicates works illustrated based on tracks called "Intro".

In [59]:
# Only keep Artist Names, Track Name, and Popularity from dataframe filtered by "Intro"
df_tracks_remove_dups = df_tracks_pop[df_tracks_pop["Track Name"]=="Intro"][["Artist Names","Track Name","Popularity"]]

# Group based on Artist Names and Track Name and sort by Popularity
df_tracks_remove_dups["Artist Names"] = df_tracks_remove_dups["Artist Names"].astype(str)
df_tracks_remove_dups_sorted = df_tracks_remove_dups.sort_values(["Artist Names","Track Name","Popularity"],ascending=[False,False,False])

# Print the dataframe filtered by "Intro"
df_tracks_remove_dups_sorted
Out[59]:
Artist Names Track Name Popularity
109544 ['alt-J'] Intro 50
109517 ['alt-J'] Intro 47
109420 ['alt-J'] Intro 42
109406 ['alt-J'] Intro 40
109531 ['alt-J'] Intro 32
... ... ... ...
28901 ['21 Savage', 'Metro Boomin'] Intro 49
28871 ['21 Savage', 'Metro Boomin'] Intro 41
93139 ['2 Chainz'] Intro 20
93457 ['2 Chainz'] Intro 16
93520 ['2 Chainz'] Intro 8

335 rows × 3 columns

In [60]:
# Remove duplicates in each Artist/Track Name pairing, only keeping the first instance
df_unique_songs = df_tracks_remove_dups_sorted.drop_duplicates(subset=['Artist Names', 'Track Name'], keep='first')
df_unique_songs.head()
Out[60]:
Artist Names Track Name Popularity
109544 ['alt-J'] Intro 50
184689 ['Yo Gotti'] Intro 14
97845 ['Yandel'] Intro 14
366706 ['Wu-Tang Clan'] Intro 34
21156 ['Wiz Khalifa'] Intro 31

Now let's apply this procedure to the entire dataset.

In [61]:
df_tracks_pop["Artist Names"] = df_tracks_pop["Artist Names"].astype(str)
df_tracks_sorted = df_tracks_pop.sort_values(["Artist Names","Track Name","Popularity"],ascending=[False,False,False])
df_tracks_no_dups = df_tracks_sorted.drop_duplicates(subset=['Artist Names', 'Track Name'], keep='first')
df_tracks_no_dups
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
Out[61]:
Track Name Artist Names Album Name Duration in ms Explicit Popularity Track ID Artist IDs Album ID Href URI Acousticness Danceability Energy Instrumentalness Liveness Loudness Speechiness Valence Tempo Time Signature Key Mode Release Date Genres Artist ID Artist_ID Genre
203757 龍門澡堂 ['黃雨勳'] 《天台》電影原聲帶 78346 False 7 4seqtMKDbNvZkq2ipbLwiq ['6TwanpBr3fIrx6ITQM9kc0'] 2afPHXn7m4KQECRsVn41us https://api.spotify.com/v1/tracks/4seqtMKDbNvZ... spotify:track:4seqtMKDbNvZkq2ipbLwiq 0.144000 0.607 0.717 0.886000 0.0860 -8.580 0.0361 0.861 118.078 4.0 2.0 0.0 2013-07-11 [c-pop, mandopop, taiwan pop, zhongguo feng] 2elBjNSdBE2Y3f0j1mjrql 2elBjNSdBE2Y3f0j1mjrql None
203768 骰子 ['黃雨勳'] 《天台》電影原聲帶 61080 False 6 4PVXQa9oPuAA3rBFRLNtcT ['6TwanpBr3fIrx6ITQM9kc0'] 2afPHXn7m4KQECRsVn41us https://api.spotify.com/v1/tracks/4PVXQa9oPuAA... spotify:track:4PVXQa9oPuAA3rBFRLNtcT 0.000040 0.294 0.984 0.928000 0.3300 -7.432 0.1060 0.251 108.328 4.0 8.0 1.0 2013-07-11 [c-pop, mandopop, taiwan pop, zhongguo feng] 2elBjNSdBE2Y3f0j1mjrql 2elBjNSdBE2Y3f0j1mjrql None
203760 阿郎的童年 ['黃雨勳'] 《天台》電影原聲帶 66426 False 7 3oNgrVhwvQFOBNdfpsi239 ['6TwanpBr3fIrx6ITQM9kc0'] 2afPHXn7m4KQECRsVn41us https://api.spotify.com/v1/tracks/3oNgrVhwvQFO... spotify:track:3oNgrVhwvQFOBNdfpsi239 0.161000 0.206 0.702 0.978000 0.3790 -10.146 0.0680 0.332 162.785 4.0 2.0 1.0 2013-07-11 [c-pop, mandopop, taiwan pop, zhongguo feng] 2elBjNSdBE2Y3f0j1mjrql 2elBjNSdBE2Y3f0j1mjrql None
203765 逛夜市 ['黃雨勳'] 《天台》電影原聲帶 41680 False 7 0dPu1ryKgufwvl59UNUyMY ['6TwanpBr3fIrx6ITQM9kc0'] 2afPHXn7m4KQECRsVn41us https://api.spotify.com/v1/tracks/0dPu1ryKgufw... spotify:track:0dPu1ryKgufwvl59UNUyMY 0.448000 0.733 0.697 0.000001 0.1810 -8.391 0.0406 0.860 120.041 4.0 10.0 0.0 2013-07-11 [c-pop, mandopop, taiwan pop, zhongguo feng] 2elBjNSdBE2Y3f0j1mjrql 2elBjNSdBE2Y3f0j1mjrql None
203772 英雄之歌 ['黃雨勳'] 《天台》電影原聲帶 54840 False 7 2HIi9oYMjkNiq3g5TVpWb5 ['6TwanpBr3fIrx6ITQM9kc0'] 2afPHXn7m4KQECRsVn41us https://api.spotify.com/v1/tracks/2HIi9oYMjkNi... spotify:track:2HIi9oYMjkNiq3g5TVpWb5 0.000889 0.339 0.678 0.942000 0.4080 -9.961 0.0345 0.373 164.174 4.0 5.0 0.0 2013-07-11 [c-pop, mandopop, taiwan pop, zhongguo feng] 2elBjNSdBE2Y3f0j1mjrql 2elBjNSdBE2Y3f0j1mjrql None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297224 Hip Hug-Her ["Booker T. & the M.G.'s", 'The Mar-Keys'] Live at the Monterey International Pop Festival 190266 False 28 2fhLSpE7wlrqdt3Yi0W0Vd ['2vDV0T8sxx2ENnKXds75e5', '4EYRKWX3RvHihDHDr8... 2HD66IaGm71axXSA8hPyNg https://api.spotify.com/v1/tracks/2fhLSpE7wlrq... spotify:track:2fhLSpE7wlrqdt3Yi0W0Vd 0.109000 0.356 0.672 0.835000 0.2000 -8.154 0.0419 0.620 121.115 4.0 10.0 1.0 2019-07-12 [classic soul, funk, memphis soul, soul, soul ... 60df5JBRRPcnSpsIMxxwQm 60df5JBRRPcnSpsIMxxwQm None
297223 Booker-Loo ["Booker T. & the M.G.'s", 'The Mar-Keys'] Live at the Monterey International Pop Festival 246360 False 31 1pVaFVuoK8r8G0QPgeIDAW ['2vDV0T8sxx2ENnKXds75e5', '4EYRKWX3RvHihDHDr8... 2HD66IaGm71axXSA8hPyNg https://api.spotify.com/v1/tracks/1pVaFVuoK8r8... spotify:track:1pVaFVuoK8r8G0QPgeIDAW 0.023300 0.511 0.722 0.744000 0.7540 -7.442 0.0421 0.775 115.523 4.0 2.0 1.0 2019-07-12 [classic soul, funk, memphis soul, soul, soul ... 60df5JBRRPcnSpsIMxxwQm 60df5JBRRPcnSpsIMxxwQm None
104988 The Club ["'In The Heights' Original Broadway Company"] In The Heights (Original Broadway Cast Recording) 358333 False 45 1v4kdQ6zcsgHSxKMHefzKV ['0kvp9mzfvoXvGtjSWTgrEb'] 3VPHalWocJfe7VfbEW60zg https://api.spotify.com/v1/tracks/1v4kdQ6zcsgH... spotify:track:1v4kdQ6zcsgHSxKMHefzKV 0.666000 0.742 0.589 0.000028 0.4540 -6.966 0.1340 0.851 104.760 4.0 7.0 1.0 2008-06-03 [show tunes] 4aXXDj9aZnlshx7mzj3W1N 4aXXDj9aZnlshx7mzj3W1N None
104989 Blackout ["'In The Heights' Original Broadway Company"] In The Heights (Original Broadway Cast Recording) 237800 False 47 7ahhev2IIbivmkRqLjm7g3 ['0kvp9mzfvoXvGtjSWTgrEb'] 3VPHalWocJfe7VfbEW60zg https://api.spotify.com/v1/tracks/7ahhev2IIbiv... spotify:track:7ahhev2IIbivmkRqLjm7g3 0.440000 0.587 0.667 0.000000 0.0778 -5.491 0.0497 0.453 99.727 4.0 7.0 0.0 2008-06-03 [show tunes] 4aXXDj9aZnlshx7mzj3W1N 4aXXDj9aZnlshx7mzj3W1N None
104979 Breathe ["'In The Heights' Original Broadway Company",... In The Heights (Original Broadway Cast Recording) 244066 False 54 6OSTrvIX8JXxRxFT4C9y0J ['0kvp9mzfvoXvGtjSWTgrEb', '2xBOCZLKP7FMXK2dyO... 3VPHalWocJfe7VfbEW60zg https://api.spotify.com/v1/tracks/6OSTrvIX8JXx... spotify:track:6OSTrvIX8JXxRxFT4C9y0J 0.848000 0.397 0.242 0.000000 0.1060 -10.754 0.0663 0.144 171.747 3.0 10.0 1.0 2008-06-03 [show tunes] 4aXXDj9aZnlshx7mzj3W1N 4aXXDj9aZnlshx7mzj3W1N None

172202 rows × 28 columns

In [62]:
# Check if the results are same as before
df_tracks_no_dups[df_tracks_no_dups["Track Name"] == "Intro"].head()
Out[62]:
Track Name Artist Names Album Name Duration in ms Explicit Popularity Track ID Artist IDs Album ID Href URI Acousticness Danceability Energy Instrumentalness Liveness Loudness Speechiness Valence Tempo Time Signature Key Mode Release Date Genres Artist ID Artist_ID Genre
109544 Intro ['alt-J'] An Awesome Wave 157813 False 50 2RlSIKr5HH7RObrI8IvoUB ['3XHO7cRUPCLOr6jwp8vsx5'] 2AxfZb5aQHIXgsB1HA6OLL https://api.spotify.com/v1/tracks/2RlSIKr5HH7R... spotify:track:2RlSIKr5HH7RObrI8IvoUB 0.4450 0.583 0.791 0.722 0.202 -8.571 0.0358 0.288 144.028 4.0 11.0 0.0 2012-05-28 [indie rock, modern rock] 3XHO7cRUPCLOr6jwp8vsx5 3XHO7cRUPCLOr6jwp8vsx5 rock
184689 Intro ['Yo Gotti'] Life 236226 True 14 7rTl0OMHhXi96nqE4PdzHZ ['6Ha4aES39QiVjR0L2lwuwq'] 2E4KE4Wb1c740NJ9HS8cOl https://api.spotify.com/v1/tracks/7rTl0OMHhXi9... spotify:track:7rTl0OMHhXi96nqE4PdzHZ 0.0428 0.882 0.552 0.000 0.114 -6.966 0.0839 0.461 149.960 4.0 11.0 0.0 2010-01-19 [dirty south rap, gangster rap, hip hop, memph... 6Ha4aES39QiVjR0L2lwuwq 6Ha4aES39QiVjR0L2lwuwq pop rap
97845 Intro ['Yandel'] Quien Contra Mi 110680 False 14 4eQ21yEN5yU3AYd4hnKHKX ['0eHQ9o50hj6ZDNBt6Ys1sD'] 1xWGBUugmlHCwXH6AtpQCE https://api.spotify.com/v1/tracks/4eQ21yEN5yU3... spotify:track:4eQ21yEN5yU3AYd4hnKHKX 0.0352 0.411 0.454 0.866 0.127 -9.917 0.0333 0.224 96.718 5.0 7.0 1.0 2003 [latin, latin hip hop, reggaeton, trap latino,... 0eHQ9o50hj6ZDNBt6Ys1sD 0eHQ9o50hj6ZDNBt6Ys1sD latin
366706 Intro ['Wu-Tang Clan'] Wu-Tang Forever 122800 True 34 57C8xbz3dy5G68nrVlqBG8 ['34EP7KEpOjXcM2TCat1ISk'] 4r3TaXjF2b1qwCpxjIpW43 https://api.spotify.com/v1/tracks/57C8xbz3dy5G... spotify:track:57C8xbz3dy5G68nrVlqBG8 0.8660 0.530 0.413 0.000 0.167 -11.599 0.3200 0.742 81.381 4.0 5.0 0.0 1997-06-03 [east coast hip hop, gangster rap, hardcore hi... 34EP7KEpOjXcM2TCat1ISk 34EP7KEpOjXcM2TCat1ISk rap
21156 Intro ['Wiz Khalifa'] O.N.I.F.C. (Deluxe) 40040 True 31 3yNex9JwztwpoTsu0uQZxG ['137W8MRPWKqSmrBGDBFSop'] 5S4SuPHbaozi5PDedAONTG https://api.spotify.com/v1/tracks/3yNex9Jwztwp... spotify:track:3yNex9JwztwpoTsu0uQZxG 0.2770 0.743 0.488 0.739 0.127 -10.488 0.0727 0.598 114.959 4.0 1.0 0.0 2012-12-04 [hip hop, pittsburgh rap, pop rap, rap, southe... 137W8MRPWKqSmrBGDBFSop 137W8MRPWKqSmrBGDBFSop pop rap
In [63]:
df_tracks_no_dups["Track Name"].value_counts()[:20]
Out[63]:
Intro                 151
Outro                  47
Silent Night           40
Home                   39
White Christmas        35
Forever                30
Paradise               30
Angel                  30
Interlude              29
Stay                   28
Heaven                 28
Hello                  27
Winter Wonderland      27
You                    26
Without You            24
Someday                23
Smile                  23
Crazy                  23
The Christmas Song     23
Jingle Bells           22
Name: Track Name, dtype: int64
In [64]:
draw_duplicate_graph(df_tracks_no_dups["Track Name"].value_counts())

This conludes the preprocessing / duplicate removal for the tracks dataset. As can be seen above, there are still lots of titles that appear several times. However, in all cases it seems reasonable to assume that they were mostly performed by different artists and can therefore be considered unique songs.

In [65]:
df_tracks_no_dups = df_tracks_no_dups.sort_index()
In [66]:
df_tracks_no_dups.to_csv(PATH_TRACKS_CLEAN)

Network Analysis Preprocessing

Creation of Nodes Masterdata

In [67]:
# Read in the data
df_tracks = pd.read_csv(PATH_TRACKS_CLEAN, index_col=0, 
                        converters = {"Artist Names": ast.literal_eval})

df_artists = pd.read_csv(PATH_ARTISTS_CLEAN, index_col=0,
                         converters = {"Genres": ast.literal_eval})

Since the tracks number is a external number, it is recalculated with the songs availabe to us from the fetching.

In [68]:
df_track_all_artists = df_tracks.explode("Artist Names")

All songs from artists which are part of df_artists are kept, grouped and counted.

In [69]:
df_artist_tracks = pd.DataFrame(df_track_all_artists[df_track_all_artists["Artist Names"].\
    isin(df_artists["Artist Name"])].\
    groupby(["Artist Names"])["Track Name"].count())

# Rename Column
df_artist_tracks = df_artist_tracks.rename({"Track Name": "Tracks Count"}, axis=1)
In [70]:
df_nodes = pd.merge(df_artists, df_artist_tracks, left_on="Artist Name", right_on="Artist Names", how="left")

Create helper column with the amount of artists collaborating on a song which is the criterion used to filter the data later on.

In [71]:
df_tracks["Artist Count"] = df_tracks["Artist Names"].apply(lambda x: len(x))
In [72]:
df_artist_collabo_tracks = df_tracks.explode("Artist Names")

The exploded data is filter by the amount of artists taking part in the track production. If more than one, the data is kept.

In [73]:
df_artist_collabo_tracks = df_artist_collabo_tracks[df_artist_collabo_tracks["Artist Count"] > 1] 

Same as before. If the artist can be found in df_artists, it is kept, ground and the amount counted.

In [74]:
df_artist_collabo_tracks = pd.DataFrame(df_artist_collabo_tracks[df_artist_collabo_tracks["Artist Names"].\
    isin(df_artists["Artist Name"])].\
    groupby(["Artist Names"])["Track Name"].count())

Rename column for merging.

In [75]:
df_artist_collabo_tracks = df_artist_collabo_tracks.rename({"Track Name": "Tracks Collabo Count"}, axis=1)

Data merging.

In [76]:
df_nodes = pd.merge(df_nodes, df_artist_collabo_tracks, left_on="Artist Name", 
                      right_on="Artist Names", how="left")

Now the share of collaboration from the overall songs produced per artists can be produced.'

In [77]:
df_nodes["Tracks Collabo Share"] = df_nodes["Tracks Collabo Count"]/df_nodes["Tracks Count"]
In [78]:
df_nodes.drop("Tracks Collabo Count", axis=1, inplace=True)
In [79]:
df_nodes["Tracks Count"].fillna(0, inplace=True)
df_nodes["Tracks Collabo Share"].fillna(0, inplace=True)

The artist name Joey Bada$$ leads to issues later on with nx.draw_kamada_kawai. Therefore the artist is renamed

In [80]:
df_nodes.loc[df_nodes["Artist Name"] == "Joey Bada$$", "Artist Name"] = "Joey Badass"

Each genre gets a color assigned for the visualization later on. The colors are taken from a HEX palette generator and tried to take an opposing color for every following element to generate better contrast in network later on

In [81]:
genre_list_top_vals["Genre Color"] = ["#cd6155",
                                      "#566573",
                                      "#99a3a4",
                                      "#3bc14a",
                                      "#5499c7",
                                      "#057476",
                                      "#2980b9",
                                      "#bb8fce",
                                      "#ffe900"]
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/ipykernel_launcher.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':

Filtering down the artists in df_artists if they are part of the top gernres seleted. From now on df_nodes is considered the "master data" for the network analysis.

In [82]:
df_nodes = df_nodes[df_nodes["Genre"].isin(list(genre_list_top_vals.index))]

The genre colors are added to df_nodes by joining them on the genre.

In [83]:
df_nodes
Out[83]:
Artist Name Follower_Count Genres Artist_ID Popularity Genre Rank Tracks Count Tracks Collabo Share
0 Drake 50509972 [canadian hip hop, canadian pop, hip hop, pop ... 3TVXtAsR1Inumwj472S9r4 100 pop rap 0 307.0 0.599349
1 Ed Sheeran 71571877 [pop, uk pop] 6eUKZXaKkcviH0Ku9w2n3V 93 pop 1 124.0 0.395161
2 Post Malone 29076628 [dfw rap, melodic rap, rap] 246dkjvS1zLTtiykXe5h60 95 rap 2 70.0 0.471429
3 Eminem 38966309 [detroit hip hop, hip hop, rap] 7dGJo4pcD2V6oG8kP0tJRR 94 rap 3 318.0 0.433962
4 Ariana Grande 52367445 [dance pop, pop, post-teen pop] 66CXWjxzNUsdJxJ2JdwvnR 98 pop 4 163.0 0.398773
... ... ... ... ... ... ... ... ... ...
992 Brando 12273 [dance pop, pop dance] 5uEeqYFuIChoWKy34jp8xE 74 pop dance 992 8.0 1.000000
993 Ludmilla 3532726 [funk carioca, funk pop, pagode baiano, pop na... 3CDoRporvSjdzTrm99a3gi 75 pop 993 64.0 0.281250
995 Håkan Hellström 237740 [classic swedish pop, gothenburg indie, swedis... 3H7Ez7cwaYw4L3ELy4v3Lc 63 pop 995 160.0 0.037500
997 Bizarrap 1500595 [argentine hip hop, pop venezolano, trap argen... 716NhGYqD1jl2wI1Qkgq36 83 latin 997 0.0 0.000000
998 AOA 558290 [k-pop, k-pop girl group] 54gWVQFHf8IIqbjxAoOarN 57 pop 998 32.0 0.093750

801 rows × 9 columns

In [84]:
genre_list_top_vals
Out[84]:
Genre Genre Color
pop 315 #cd6155
rock 138 #566573
latin 92 #99a3a4
pop rap 77 #3bc14a
pop dance 60 #5499c7
rap 48 #057476
country 36 #2980b9
alternative metal 19 #bb8fce
german hip hop 16 #ffe900
In [85]:
df_nodes = pd.merge(df_nodes,
                    genre_list_top_vals,
                    left_on="Genre", 
                    right_index=True, 
                    how="left",
                    suffixes=('', '_y')).drop(columns=["Genre_y"])

The popularity score and number of tracks by each artist could lead to interesting insights in the analysis, however, they are too granular. Therefore, new columns with bins are added.

Popularity

Look at the characteristics of Popularity variable.

In [86]:
df_nodes["Popularity"].describe()
Out[86]:
count    801.000000
mean      79.308365
std        6.391678
min       57.000000
25%       75.000000
50%       79.000000
75%       83.000000
max      100.000000
Name: Popularity, dtype: float64
In [87]:
sns.distplot(df_nodes["Popularity"])
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[87]:
<AxesSubplot:xlabel='Popularity', ylabel='Density'>

Calculate the range of values.

In [88]:
df_nodes["Popularity"].max() - df_nodes["Popularity"].min()
Out[88]:
43

Since it is a low range and is somehow normally distributed, a linear assignment to the bins is chosen. To have a clear cutoff three or six bins seem ideal. Three bins are chosen two get a broader picture.
Note: The numbers are added to avoid manuel sorting later on.

In [89]:
df_nodes["Popularity Bins"] = pd.cut(df_nodes["Popularity"], 3, labels=["3: Less Popular", 
                                                                        "2: Popular", 
                                                                        "1: Very Popular"])
Tracks

Look at the characteristics of Tracks Count variable.

In [90]:
df_nodes["Tracks Count"].describe()
Out[90]:
count     801.000000
mean      148.475655
std       202.584068
min         0.000000
25%        46.000000
50%        97.000000
75%       184.000000
max      3416.000000
Name: Tracks Count, dtype: float64
In [91]:
sns.distplot(df_nodes["Tracks Count"])
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[91]:
<AxesSubplot:xlabel='Tracks Count', ylabel='Density'>
In [92]:
df_nodes["Tracks Count"].max()
Out[92]:
3416.0
In [93]:
df_nodes[df_nodes["Tracks Count"] == 3416]
Out[93]:
Artist Name Follower_Count Genres Artist_ID Popularity Genre Rank Tracks Count Tracks Collabo Share Genre Color Popularity Bins
907 Grateful Dead 1086013 [classic rock, cosmic american, country rock, ... 4TMHGUX5WI7OOm53PqSDAT 73 rock 907 3416.0 0.004977 #566573 2: Popular

In contrast to the popularity, the the artists produce a very different amount of tracks. It shows a right skewed distribution and the most diligent artist in the dataset, Grateful Dead, produce 3543 songs, where the average artist produced only 147 songs. Therefore, bins are create manually at own descretion to cover different artist types best.

In [94]:
def create_track_bins(df_artists):
    df_artists["Tracks Bins"] = 0
    df_artists.loc[df_artists["Tracks Count"] <= 25, "Tracks Bins"] = "1: Up to 25"
    df_artists.loc[(df_artists["Tracks Count"] > 25) & (df_artists["Tracks Count"] <= 50), 
                   "Tracks Bins"] = "2: Up to 50"
    df_artists.loc[(df_artists["Tracks Count"] > 50) & (df_artists["Tracks Count"] <= 100), 
                   "Tracks Bins"] = "3: Up to 100"
    df_artists.loc[(df_artists["Tracks Count"] > 100) & (df_artists["Tracks Count"] <= 200), 
                   "Tracks Bins"] = "4: Up to 200"
    df_artists.loc[(df_artists["Tracks Count"] > 200) & (df_artists["Tracks Count"] <= 300), 
                   "Tracks Bins"] = "5: Up to 300"
    df_artists.loc[(df_artists["Tracks Count"] > 300) & (df_artists["Tracks Count"] <= 500), 
                   "Tracks Bins"] = "6: Up to 500"
    df_artists.loc[df_artists["Tracks Count"] > 500, "Tracks Bins"] = "7: Over 500"

Applying bin creation for tracks.

In [95]:
create_track_bins(df_nodes)
In [96]:
df_nodes.to_csv(PATH_NODES)

Creation of Edgelist

In [97]:
def create_edgelist(artist_list):
    """
    Creates a weighted edgelist based on artist collaborations.
    
        Parameters:
            artist_list (list): List of lists of artists who collaborated on songs.
        
        Returns:
            df_edgelist (dataframe): Pandas dataframe that shows number of collaboations between all artists. 
    """
    
    collaboration_dict = defaultdict(int)
    
    # Iterate over all artist collaborations
    for collaboration in artist_list:
    
    # Check whether more than one artist created the song
        if len(collaboration) > 1:
    
        # Get all possible combinations of artists
            combinations = list(itertools.combinations(sorted(collaboration), 2))
    
            # Increment count of artist collaborations by one
            for combination in combinations:
                collaboration_dict[combination] += 1
            
    # Convert dict to dataframe
    df_edgelist = pd.Series(collaboration_dict).reset_index()
    df_edgelist.columns = ['Artist_1', 'Artist_2', 'Weight']
    # Necessary to reduce the absolute value since they resulted in too broad lines in the visualizations
    df_edgelist["Weight for Viz"] = df_edgelist["Weight"]/10
    
    # Return edgelist dataframe
    return df_edgelist

Call function to generate edgelist for all artists.

In [98]:
df_edgelist = create_edgelist(list(df_tracks["Artist Names"]))
In [99]:
len(df_edgelist)
Out[99]:
37857

Unfilterted the network holds over 37 thousand edges in its network.

By applying a filter, which excluded all artists that cannot be assigned to the top 1000 artists for both nodes, the number of edges drops significantly. This step is taken and even narrowed down further by only including the artists which were left after the genre filtering.

In [100]:
df_edgelist = df_edgelist[(df_edgelist["Artist_1"].isin(df_nodes["Artist Name"])) & 
                                (df_edgelist["Artist_2"].isin(df_nodes["Artist Name"]))]
In [101]:
len(df_edgelist)
Out[101]:
4387

By doing so, the network which is further analyzed holds only around 4.4 thousand edges compared to over 37 thousand before any filtering.

This concludes the preprocessing of the data used in the network analysis. However, in the following steps the dataframe is enriched with additional data generated by the network.

In [102]:
df_edgelist.to_csv(PATH_EDGES)

Text Preprocessing of Lyrics

Define functions

In [103]:
def verse_info_removal(df_col):
    """
    Removes all verse info provided in square brackets such as "[intro]" or "[chorus]" from the string and returns
    the cleaned string.
    
    input:
    df_col (str):
    
    output:
    cleaned string
    """
    print("Removing verse info...\n")
    
    return df_col.apply(lambda x: re.sub(r"\[(.*?)\]", "", str(x))) 
In [104]:
def detect_language(df_col):
    
    lans = []
    
    for index,text in tqdm(enumerate(df_col)):
        
        try:
            lans.append(detect(text))
        except:
            print(f'Exception thrown for text "{text}"at index {index}.')
            lans.append("Not classified")
    return lans
In [105]:
def tokenization(df_col):
    """
    Takes a list with strings and returns a list with tokens
    """
    print("Tokenizing words...\n")
    return df_col.apply(lambda x: word_tokenize(x))
In [106]:
def not_appender(df_col):
    """
    Takes from the tokens the "n't" as result from negative contradictions and append it to the following word.
    The adjusted token list is returned. 
    """
    print("Including negation in words...\n")
    df_col = df_col.apply(lambda x: " ".join(x))
    df_col = df_col.apply(lambda x: x.replace("n't ", "not_"))
    df_col = df_col.apply(lambda x: x.split(" "))
    return df_col
In [107]:
def only_alphabetic(df_col):
    """
    Keeps only tokens which are alphabetic or an underscore and returns them.
    """
    print("Removing all non-alphabetic words...\n")
    return df_col.apply(lambda x: [token for token in x if re.match("^[a-zA-Z0_]*$", token)])
In [108]:
def lowercase(df_col):
    """
    Returns all tokens in lowercase.
    """
    print("Making all words lowercase...\n")
    return df_col.apply(lambda x: [token.lower() for token in x])
In [109]:
stop_words = set(stopwords.words('english'))
stop_words.update(["yeah","wanna","oh","ooh","la","lala","lalala", "got","na","ayy","yo","tryna","damn","huh","ai","ayy"])
                   #"like","know","time","never","back","want","make","come","take","feel","right","need", # removed from top words
                   #"could","tell","live","keep","would","thing","still","said","every","little","around",
                   #"cause","really","well","something"])

def stopword_removal(df_col):
    """
    Removes all words considered as stopwords and all words that have a length of three or less.
    """
    
    print("Removing Stopwords...\n")

    return df_col.apply(lambda x: [token for token in x if token not in stop_words and len(token) > 3])
In [110]:
def lemmatization(df_col):
    """
    Applies lemmatization to all tokens and returns them afterwards.
    """
    
    print("Lemmatizing words...\n")
    
    lemmatizer = WordNetLemmatizer()
    return df_col.apply(lambda x: [lemmatizer.lemmatize(token) for token in x])
In [111]:
def preprocessing(df, tokenized_YN = 1, *steps):
    """
    Takes in a dataframe and selects the column with the text and applies preprocessing steps given 
    in and returns either tokens or a string, as determined.
    
    Input:
    - df (dataframe): The dataframe containing the text column.
    - tokenzied (boolean): Returns if 1 a list of tokens, for 0 a string.
    - steps (functions): Multiple functions for preprocessing can be given in.
    
    Output:
    - Either a list with lists or a list with strings.
    """
    # copying over the column for preprocessing
    temp = df["Lyrics Raw"].copy()
    for func in steps:
        temp = func(temp)
    return temp if tokenized_YN == 1 else temp.apply(lambda x: " ".join([token for token in x]))
    

Load in relevant data

In [112]:
df_lyrics = df_lyrics_with_genre.copy()
df_lyrics.head()
Out[112]:
Artist Title Lyrics Genre
0 Drake God’s Plan [Intro]\nAnd they wishin' and wishin' and wish... pop rap
1 Drake In My Feelings [Intro: Drake]\nTrap, TrapMoneyBenny\nThis shi... pop rap
2 Drake Hotline Bling [Intro]\nYou used to call me on my\nYou used t... pop rap
3 Drake One Dance [Intro: Kyla]\nBaby, I like your style\n\n[Ver... pop rap
4 Drake Hold On, We’re Going Home [Produced by Nineteen85, Majid Jordan & Noah "... pop rap

Execution

Get preprocessed dataframe columns

In [113]:
# Rename Lyrics column to Lyrics Raw
df_lyrics.rename(columns={"Lyrics":"Lyrics Raw"},inplace=True)
In [114]:
# Remove all content inside square brackets and \n from Lyrics
df_lyrics["Lyrics"] = verse_info_removal(df_lyrics["Lyrics Raw"]).apply(lambda x: re.sub(r"\n", ". ", str(x)))

# Strip all leading and trailing whitespace
df_lyrics["Lyrics"] = df_lyrics['Lyrics'].str.strip()

# Remove all empty srings
df_lyrics = df_lyrics[df_lyrics['Lyrics'].map(len) > 0]
Removing verse info...

In [115]:
# Detect language of all song lyrics (function takes quite long to execute)
df_lyrics["Language"] = detect_language(df_lyrics["Lyrics"])
5315it [01:08, 84.62it/s] 
Exception thrown for text ". ."at index 5297.
9817it [02:06, 83.09it/s] 
Exception thrown for text ". ."at index 9803.
Exception thrown for text "."at index 9806.
Exception thrown for text "."at index 9807.
Exception thrown for text "."at index 9808.
10443it [02:14, 112.98it/s]
Exception thrown for text "..."at index 10418.
17676it [03:35, 114.31it/s]
Exception thrown for text "-"at index 17654.
24150it [04:53, 82.15it/s] 
In [116]:
# Print number of occurences of each genre
df_lyrics["Language"].value_counts()
Out[116]:
en                19145
es                 2781
pt                  661
de                  482
fr                  295
it                  182
tl                  152
sv                   77
nl                   77
ko                   59
tr                   51
ro                   29
id                   24
so                   17
ca                   15
hi                   15
pl                   14
vi                    9
et                    9
Not classified        7
cy                    7
sw                    6
af                    6
ja                    5
no                    5
hr                    4
ru                    4
da                    3
zh-cn                 2
cs                    2
sk                    1
bn                    1
lt                    1
sl                    1
sq                    1
Name: Language, dtype: int64
In [117]:
# Only keep song lyrics in english (which also removes not classified songs)
df_lyrics = df_lyrics[df_lyrics["Language"] == "en"].reset_index(drop=True)

# Remove language column
df_lyrics = df_lyrics.drop("Language",axis=1)
In [118]:
# Create column with preprocessed and tokenized lyrics
df_lyrics["Lyrics Clean Tok"] = preprocessing(df_lyrics,
                                              1,
                                              verse_info_removal,
                                              tokenization,
                                              not_appender,
                                              only_alphabetic,
                                              lowercase,
                                              stopword_removal,
                                              lemmatization)
Removing verse info...

Tokenizing words...

Including negation in words...

Removing all non-alphabetic words...

Making all words lowercase...

Removing Stopwords...

Lemmatizing words...

In [119]:
# Create column with preprocessed and un-tokenized lyrics
df_lyrics["Lyrics Clean No Tok"] = preprocessing(df_lyrics,
                                                 0,
                                                 verse_info_removal,
                                                 tokenization,
                                                 not_appender,
                                                 only_alphabetic,
                                                 lowercase,
                                                 stopword_removal,
                                                 lemmatization)
Removing verse info...

Tokenizing words...

Including negation in words...

Removing all non-alphabetic words...

Making all words lowercase...

Removing Stopwords...

Lemmatizing words...

In [120]:
df_lyrics.describe()
Out[120]:
Artist Title Lyrics Raw Genre Lyrics Lyrics Clean Tok Lyrics Clean No Tok
count 19145 19145 19145 16191 19145 19145 19145
unique 859 16288 18928 9 18928 18861 18861
top Lil Skies Home \n Lyrics for this song h... pop . Lyrics for this song have yet to... [lyric, song, released, please, check, back, s... lyric song released please check back song rel...
freq 50 15 28 7152 28 29 29
In [121]:
# Drop empty rows
df_lyrics = df_lyrics.dropna(axis=0).reset_index(drop=True)
In [122]:
# Create column with token count of songs
df_lyrics["Word Count"] = df_lyrics["Lyrics"].str.split().map(lambda x: len(x))
In [123]:
# Drop rows with less than 20 tokens
df_lyrics = df_lyrics[df_lyrics["Word Count"] >= 20].reset_index(drop=True)
In [124]:
df_lyrics["Genre"].value_counts()
Out[124]:
pop                  7129
rock                 3277
pop rap              1888
pop dance            1207
rap                  1196
country               899
alternative metal     451
latin                 107
german hip hop          4
Name: Genre, dtype: int64

German hip hop is hardly represented as a genre in the lyrics dataset, which makes sense given that all non-English tracks are supposed to be filtered out. Taking a closer look at the lyrics from this gernre, it is appearant that those songs have probably been falsely classified to be English (probably due to the mix of English and German words) and are therefore dropped.

In [125]:
for lyric in df_lyrics[df_lyrics["Genre"] == "german hip hop"]["Lyrics"]:
    print(lyric)
. . . Yeah, oh, this is RAF Camora longside Gentleman, come on. Brighta days, check it out, wouh. . . I carry wata to the roots, yeah. I know where I am going 'cause I know where we all come from. Tell me, who knows the truth yeah?. Everybody run, nobody know where we a run from. . . Ich komm' zurück in mein'n Bezirk und weiß dann wieder, wohin es geht. So viel passiert, doch es bestimmt mein'n Weg. Sie schmieden Pläne, aber alles Fantasie hoch zehn. Schreiben mir auf Insta, doch ich tu', als hätt' ich's nicht geseh'n. Fühle mich bombe. Immer wieder nur auf Tour mit der Bande. Was laberst du mich zu, mein hombre?. Jaja, lass gut sein, danke. Wir sind weder Brüder noch blutsverwandt. Ein Gramm Kokain und du wirst zur Schlampe. Ich wusste, dass ich deine Zukunft kannte. Was siehst du mich so an?. Okay, du ziehst 'ne Gun. Doch sie ist nicht gelad'n, du Piç. Kommst ohne Munition, doch du  erklärst mir den Krieg. . . I carry wata to the roots, yeah. I know where I am going 'cause I know where we all come from. Tell me, who knows the truth yeah?. Everybody run, nobody know where we a run from. We got to overstand a man is just a man. Sometimes we right and sometimes wrong. But we never ever forget our foundation. Come mek we shorten our future plans. . . Ich geb' nur Herz für was ich liebe. Wären es nicht hundert Prozent, wär' dieser Song eine Intrige. 1998 rauch' ich Weed, im Radio läuft „Tabula Rasa“. Heut singt Gentleman die Hook, einfach unfassbar. Ich komm' zurück in mein'n Bezirk, suche, was von damals hinterblieb'n ist. Ess' mit denselben Jungs im selben Pizzaimbiss. Sie schickt Küsse, weil sie in mich verliebt ist. Doch ihre Attitüde riecht nach Business. Wo ich gewohnt hab', sorgten Drogen für Wohlstand. Jetzt sing' ich nur „yo“ ins Mikro und hol' meinen Lohn ab. Sie fragen nach der Show nur, was ich verdiene. Komm'n mit Messer und erzählen was von Frieden. . . I carry wata to the roots, yeah. I know where I am going 'cause I know where we all come from. Tell me, who knows the truth yeah?. Everybody run, nobody know where we a run from. We got to overstand a man is just a man. Sometimes we right and sometimes wrong. But we never ever forget our foundation. Come mek we shorten our future plans. . [Part 3: RAF Camora & . Gentleman. ]. Ich geb' nur Herz für was ich liebe. Wären es nicht hundert Prozent, wär' dieser Song eine Intrige. Um mich herum Kommerzpop auf Bravo Hits. Meine Stimme, Schock, Höhenangst vor der Perspektive. Too much confusion. Travel inna a place away from illusion. So I put my shoes on. Never forget my roots when I move on. Can't put no blues on. Everybody move fast, we just cruise on. Actions fi choose from. Here is the sound of the wickedest fusion. Never forget where we from. While we are . journeying.  throu the jungle of danger . and.  temptation. Seh dem no second to none. This is RAF Camora, Gentleman, now what a musical explosion. . . I carry wata to the roots, yeah. I know where I am going 'cause I know where we all come from. Tell me, who knows the truth yeah?. Everybody run, nobody know where we a run from. We got to overstand a man is just a man. Sometimes we right and sometimes wrong. But we never ever forget our foundation. Come mek we shorten our future plans. . . Dun know seh, we come a long way, we still around. Still a mek music. A wha do dem man bomboclath. See them silent now. Nothing left fi seh. Remember empty barrel mek di most noise enuh. Haha
. . . Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot. . . Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot. Du bist grad in mei'm Kopf. Du bist grad in mei'm Kopf. Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot. . . Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ein'n Shot, ein'n Shot. Hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch ein'n Shot, ein'n Shot, ein'n Shot. . . Du bist am. Glänzen wie die Diamonds an mei'm Wrist (Mula). Ey, ja, mein Molly ist pink (pink). Hol mir noch ein'n Shot, keine Drinks (ein'n Shot). So viel Weiber im Club, doch ich denk' nur an dich, ja, ja. Ihr seid nicht auf meiner Wave, ja, zu viel Drip, ja, ja. Nenn mich Offset, komm, wir machen Party (ja). Baby, heute Abend bist du meine Cardi (Cardi B). Heute Abend, Baby, bist du mit dem Kaiser, ja. Komm, wir trinken weiter, ja, ja, ja. Hol mir Shots, hol mir Shots. Ohne dich, Baby, ja, bin ich lost. Baby, ja, dein Körper viel zu krass, für dich hab' ich immer Platz. In mei'm Herzen, ey, wie in meinem Loft. . . Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot (ja, ja). Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot. Du bist grad in mei'm Kopf. Du bist grad in mei'm Kopf. Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot. . . Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey (Wave). Shawty, hol mir noch 'n Shot, ja, ey (Wave). Shawty, hol mir noch 'n Shot, ein'n Shot, ein'n Shot. Hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch ein'n Shot, ein'n Shot, ein'n Shot. . . Kenne kein Stopp (nein, nein). Gebe Gas, hol' dich ab in 'nem Dodge (skrrt, skrrt). Du bist nice, kurzer Rock (ja, ja). Baby, du verdrehst mir meinen Kopf, ey (meinen Kopf). So viel Weiber im Club, doch ich denk' nur an dich, ja, ja. Du bist nice, Baby, ja, ja, ja, dein Outfit sitzt, ja, ja. . . Hol mir noch ein'n Shot (ja, ja, ihr wisst Bescheid, ja). Hol mir noch ein'n Shot. Shawty, hol mir noch ein'n Shot. Shawty, hol mir noch einen Shot (ja, ja). Noch einen Shot. . . Shawty, hol mir noch 'n Shot (ey). Shawty, hol mir noch 'n Shot (Stay High). Shawty, hol mir noch 'n Shot (Data Luv). Shawty, hol mir noch 'n Shot. Du bist grad in mei'm Kopf. Du bist grad in mei'm Kopf. Shawty, hol mir noch 'n Shot. Shawty, hol mir noch 'n Shot (ey, Ufo361). . . Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ein'n Shot, ein'n Shot. Hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch 'n Shot, ja, ey. Shawty, hol mir noch ein'n Shot, ein'n Shot, ein'n Shot. . . Wave. Hol mir noch ein'n Shot. Shawty, hol mir noch 'n Shot, ja, ey. Hol mir noch ein'n Shot. Shawty, hol mir noch 'n Shot, ja, ey. Hol mir noch ein'n Shot. Shawty, hol mir noch 'n Shot, ja, ey. Hol mir noch ein'n Shot. Shawty, hol mir noch 'n Shot, ein'n Shot, ein'n Shot. Hol mir noch ein'n Shot. Hol mir noch 'n Shot, ja, ey. Hol mir noch ein'n Shot. Shawty, hol mir noch 'n Shot, ja, ey. Hol mir noch ein'n Shot. Shawty, hol mir noch 'n Shot, ja, ey. Hol mir noch ein'n Shot. Shawty, hol mir noch 'n Shot, ein'n Shot, ein'n Shot
. . . Jimmy, Jimmy, Jimmy. . . Big Drip, every time. Freebandz, Stay High. Meine Chain, Dicka, blendet sie. Stay High, meine Family. Ja, ich schieß' auf die Enemies. Nein, sie haben nicht unsere Energy. . . Big Drip (Ja, ja), every time. Freebandz, Stay High. Meine Chain, Dicka, blendet sie. Stay High, meine Family. Ja, ich schieß' auf die Enemies. Nein, sie haben nicht unsere Energy. . . Big clip, yeah, and you die. Big clip, yeah, you die, yeah. Freebandz, yeah, for life, yeah. Taliban, Stay High, not nice. . . I went and bought out the whole store. I went and fucked on my ol' ho. I had a bottle, some lucid lean. I put the red bottles on my team, dollar. Thinkin' my stone got pink, boughta. Switch my drip, had to switch, martyr. All of my pianos splurgin' in Prada. Walk through looking like a trillion-dollars (Dollar, dollar). Fuck that bitch, I dropped her off at Chanel. Keep it confidential, never can tell. My young niggas shining like a chandelier. We take the crime rate up. . . Big Drip (Ah), every time. Freebandz, Stay High. Meine Chain, Dicka, blendet sie. Stay High, meine Family. Ja, ich schieß' auf die Enemies. Nein, sie haben nicht unsere Energy. . . Big Drip (Ja, ja), every time. Freebandz, Stay High. Meine Chain, Dicka, blendet sie. Stay High, meine Family. Ja, ich schieß' auf die Enemies. Nein, sie haben nicht unsere Energy. . [Bridge 1: Future & . Ufo361. ]. Stay High with me, nigga, stay higher (. Uh-uh. ). Stay High yeah baby, stay higher, uh (. Ja, der Thron ist mein. ). Stay High with me, stay higher, uh (. Bis zum Tod, Stay High. ). Stay higher (. Ja, ja, ja, ja, ihr wisst Bescheid. ). . . Big Drip, ich bin rich-rich, no cap. Ja, mein Benz ist black wie Kodak. Skrrt-skrrt-skrrt – . neue AP. Skrrt-skrrt-skrrt – neue Rolex. Wir sind Legends, wir nehmen dir deine Show weg. Ja, ja, ja, ja. Audemars (Audemars). Piguet Skelly an mei'm Wrist. Chille im Telly mit nicen Chicks. Chill' im Adlon, chill' im Ritz. Ackern hardcore, nein, kein Witz. Ja, hab' ein'n Part von meinem Big Bro. Big Moves, Big Biz, ja. . . Big Drip (Ihr wisst Bescheid), every time. Freebandz (Stay High), Stay High. Meine Chain, Dicka, blendet sie. Stay High, meine Family. Ja, ich schieß' auf die Enemies. Nein, sie haben nicht unsere Energy (Nein). . . Big Drip (Ja, ja), every time (Every time). Freebandz, Stay High.  (Ja). Meine Chain, Dicka, blendet sie (Ja). Stay High, meine Family (Ja). Ja, ich schieß' auf die Enemies (Ja). Nein, sie haben nicht unsere Energy. . [Bridge 2: Future & . Ufo361. ]. Stay High with me, ayy, stay higher (. Ihr wisst Bescheid. ). Stay High, nigga, better stay higher (. Rich, Rich. ). Stay High with me (. Ja. ), ayy, stay high with me (. Ja. ). Stay High (. Ja. ) nigga, stay high, ah. . . Like that shit
. . . Ey, ey, eyy. Ich chill' im Auto, höre, aye, hah-ah (jaa), ja. Ich chill' im Auto, höre Baka Not Nice, ja. Schuhe von Nike. Sie will poppen und sie weiß nicht, wo sie bleibt. Denn sie ruft mich, ey, ja. Jimmy, Jimmy, Jimmy. . . Ich chill' im Auto, höre Baka Not Nice, ja. Schuhe von Nike. Sie will poppen und sie weiß nicht, wo sie bleibt. Denn sie ruft mich über . Base-Chat. Sie will poppen (Baby, pop it), pop, lock and drop it (Baby, pop it). Sie will poppen (ja), pop, lock and drop it (Baby, pop it). „Say My Name“ (ja), von Beyoncé (Baby, pop it). Sie will Haze (Baby, pop it) und Bugattis (ahh), ja. . . Der Presto ist von Nike Off-White. Kanye-Vibes, „All Of The Lights“. Sie will mit mir fahr'n, mit mir heim. Promethazin, keine Lines, keine Lines. Kanye-Vibes, „All Of The Lights“. Keine Lines, keine Lines. Die Pupillen sing'n „All Of The Lights“, jajajaja. (. Jimmy, Jimmy, Jimmy. ). . . Ich chill' im Auto, höre Baka Not Nice, ja. Schuhe von Nike. Sie will poppen und sie weiß nicht, wo sie bleibt. Denn sie ruft mich über . Base-Chat. Sie will poppen (Baby, pop it), pop, lock and drop it (Baby, pop it). Sie will poppen (ja), pop, lock and drop it (Baby, pop it). „Say My Name“ (ja), von Beyoncé (Baby, pop it). Sie will Haze (Baby, pop it) und Bugattis (ahh), ja. . . Lil Uzi, „XO Tour Llif3“. Michael-Kors-Uhr, doch keine Zeit. Sie weiß, dass ich geh' und ja, sie weint. Doch I don't really care if you cry. Wenn du weinst. Ja, mein Baby, lass' ich dich allein. Wenn du weinst, wenn du weinst. Sing' ich für sie „XO Tour Llif3“, jajajaja. (. Jimmy, Jimmy, Jimmy. ). . . Ich chill' im Auto, höre Baka Not Nice, ja. Schuhe von Nike. Sie will poppen und sie weiß nicht, wo sie bleibt. Denn sie ruft mich über . Base-Chat. Sie will poppen (Baby, pop it), pop, lock and drop it (Baby, pop it). Sie will poppen (ja), pop, lock and drop it (Baby, pop it). „Say My Name“ (ja), von Beyoncé (Baby, pop it). Sie will Haze (Baby, pop it) und Bugattis (ahh), ja. . . Ich chill' im Auto, höre Baka Not Nice, ja. Schuhe von Nike. Sie will poppen und sie weiß nicht, wo sie bleibt. Denn sie ruft mich über . Base-Chat
In [126]:
# Drop tracks that belong to German hip hop
df_lyrics = df_lyrics[df_lyrics["Genre"] != "german hip hop"].reset_index(drop=True)
In [127]:
df_lyrics.head()
Out[127]:
Artist Title Lyrics Raw Genre Lyrics Lyrics Clean Tok Lyrics Clean No Tok Word Count
0 Drake God’s Plan [Intro]\nAnd they wishin' and wishin' and wish... pop rap . And they wishin' and wishin' and wishin' and... [wishin, wishin, wishin, wishin, movin, calm, ... wishin wishin wishin wishin movin calm not_sta... 360
1 Drake In My Feelings [Intro: Drake]\nTrap, TrapMoneyBenny\nThis shi... pop rap . Trap, TrapMoneyBenny. This shit got me in my... [trap, trapmoneybenny, shit, feeling, real, ki... trap trapmoneybenny shit feeling real kiki lov... 574
2 Drake Hotline Bling [Intro]\nYou used to call me on my\nYou used t... pop rap . You used to call me on my. You used to, you ... [used, call, used, used, used, call, cell, pho... used call used used used call cell phone need ... 449
3 Drake One Dance [Intro: Kyla]\nBaby, I like your style\n\n[Ver... pop rap . Baby, I like your style. . . Grips on your w... [baby, like, style, grip, waist, front, back, ... baby like style grip waist front back know not... 416
4 Drake Hold On, We’re Going Home [Produced by Nineteen85, Majid Jordan & Noah "... pop rap . . . I got my eyes on you. You're everything ... [eye, everything, want, love, emotion, endless... eye everything want love emotion endlessly not... 348
In [128]:
df_lyrics.shape
Out[128]:
(16154, 8)
In [129]:
df_lyrics.to_csv(PATH_LYRICS_CLEAN)

Section 3: Data Analysis

In [130]:
df_tracks = pd.read_csv(PATH_TRACKS_CLEAN, index_col=0, converters = {"Artist Names": ast.literal_eval})
df_tracks.head()
Out[130]:
Track Name Artist Names Album Name Duration in ms Explicit Popularity Track ID Artist IDs Album ID Href URI Acousticness Danceability Energy Instrumentalness Liveness Loudness Speechiness Valence Tempo Time Signature Key Mode Release Date Genres Artist ID Artist_ID Genre
0 Deep Pockets [Drake] Dark Lane Demo Tapes 222928 True 63 3IvMYBE7A3c7to1aEcfFJk ['3TVXtAsR1Inumwj472S9r4'] 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/3IvMYBE7A3c7... spotify:track:3IvMYBE7A3c7to1aEcfFJk 0.482 0.473 0.824 0.000000 0.6050 -3.680 0.163 0.3740 77.888 4.0 7.0 0.0 2020-05-01 ['canadian hip hop', 'canadian pop', 'hip hop'... 3TVXtAsR1Inumwj472S9r4 3TVXtAsR1Inumwj472S9r4 pop rap
1 When To Say When [Drake] Dark Lane Demo Tapes 223124 True 65 5TCBWmEBrin7etRa4Lswr1 ['3TVXtAsR1Inumwj472S9r4'] 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/5TCBWmEBrin7... spotify:track:5TCBWmEBrin7etRa4Lswr1 0.252 0.410 0.820 0.000000 0.5380 -6.808 0.533 0.5260 170.718 4.0 1.0 1.0 2020-05-01 ['canadian hip hop', 'canadian pop', 'hip hop'... 3TVXtAsR1Inumwj472S9r4 3TVXtAsR1Inumwj472S9r4 pop rap
2 Chicago Freestyle (feat. Giveon) [Drake, Giveon] Dark Lane Demo Tapes 220487 True 84 4wVOKKEHUJxHCFFNUWDn0B ['3TVXtAsR1Inumwj472S9r4', '4fxd5Ee7UefO4CUXgw... 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/4wVOKKEHUJxH... spotify:track:4wVOKKEHUJxHCFFNUWDn0B 0.629 0.735 0.449 0.000000 0.1130 -7.507 0.347 0.0397 122.947 4.0 10.0 1.0 2020-05-01 ['canadian hip hop', 'canadian pop', 'hip hop'... 3TVXtAsR1Inumwj472S9r4 3TVXtAsR1Inumwj472S9r4 pop rap
3 Not You Too (feat. Chris Brown) [Drake, Chris Brown] Dark Lane Demo Tapes 269680 True 68 3Q4gttWQ6hxqWOa3tHoTNi ['3TVXtAsR1Inumwj472S9r4', '7bXgB6jMjp9ATFy66e... 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/3Q4gttWQ6hxq... spotify:track:3Q4gttWQ6hxqWOa3tHoTNi 0.342 0.458 0.452 0.000019 0.0703 -9.299 0.047 0.3160 86.318 4.0 9.0 0.0 2020-05-01 ['canadian hip hop', 'canadian pop', 'hip hop'... 3TVXtAsR1Inumwj472S9r4 3TVXtAsR1Inumwj472S9r4 pop rap
4 Toosie Slide [Drake] Dark Lane Demo Tapes 247058 True 80 466cKvZn1j45IpxDdYZqdA ['3TVXtAsR1Inumwj472S9r4'] 6OQ9gBfg5EXeNAEwGSs6jK https://api.spotify.com/v1/tracks/466cKvZn1j45... spotify:track:466cKvZn1j45IpxDdYZqdA 0.289 0.830 0.490 0.000003 0.1130 -8.820 0.209 0.8450 81.604 4.0 1.0 0.0 2020-05-01 ['canadian hip hop', 'canadian pop', 'hip hop'... 3TVXtAsR1Inumwj472S9r4 3TVXtAsR1Inumwj472S9r4 pop rap
In [131]:
df_tracks['Explicit'] = df_tracks["Explicit"].astype(int)
In [132]:
df_tracks['Year'] = pd.DatetimeIndex(df_tracks['Release Date']).year

Investigate Audio Features and their relationship to popularity

The selected audio features are numeric values provided by the Spotify API

In [133]:
audio_features = ['Popularity', 'Acousticness', 'Danceability', 'Energy', 'Instrumentalness', 'Liveness', 'Loudness',
                  'Speechiness', 'Valence', 'Tempo', 'Key', 'Mode', 'Year', 'Explicit']

df_tracks_af = df_tracks[audio_features]

# Drop tracks for which audio features are not available
df_tracks_af = df_tracks_af.dropna(axis=0)
In [134]:
plt.figure(figsize=(20, 10))
sns.heatmap(df_tracks_af.corr(),annot = True, fmt='.1g',square=True)
Out[134]:
<AxesSubplot:>

We can use a correlation plot to check how the audio feature are related with popularity

In [135]:
corr = np.abs(df_tracks_af.corr())
series = np.abs(corr['Popularity']).sort_values(ascending=False)
print('The most linear correlated features to POPULARITY are:')
for i, row in enumerate(series):
    if 0.2 <= row < 1:
      print(f'{series.index[i]:17} --> {row: .2f} (abs)')
The most linear correlated features to POPULARITY are:
Loudness          -->  0.38 (abs)
Acousticness      -->  0.30 (abs)
Instrumentalness  -->  0.27 (abs)
Energy            -->  0.26 (abs)
Danceability      -->  0.23 (abs)
Explicit          -->  0.21 (abs)

Also, possible outliers are detected and, if present, dropped

In [136]:
plt.figure(figsize=(16, 10))

for i in range(len(df_tracks_af.columns)):
    plt.subplot(3, 5, i + 1)
    sns.boxplot(df_tracks_af[df_tracks_af.columns[i]])

plt.show()
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
In [137]:
def outliers_count(df, threshold):
    #function that, starting from a data frame and a treshold detects
    #returns number of outliers for each column.

    df = df_tracks_af.copy()
    
    # Get the z-score for specified threshold
    threshold_z_score = stats.norm.ppf(threshold)
    
    # Get the z-scores for each value in df
    z_score_df = pd.DataFrame(np.abs(stats.zscore(df)), columns=df.columns)
    
    # Compare df z_scores to the threshold and return the count of outliers in each column
    return (z_score_df > threshold_z_score).sum(axis=0)
In [138]:
outliers_count(df_tracks_af, 0.99999)
Out[138]:
Popularity             0
Acousticness           0
Danceability           0
Energy                 0
Instrumentalness       0
Liveness               0
Loudness             356
Speechiness         2714
Valence                0
Tempo                  1
Key                    0
Mode                   0
Year                 122
Explicit               0
dtype: int64
In [139]:
def outliers_cleaner(df, threshold):
    #function that starting from a datafram and a given treshold for outliers,
    #return a datafram without the outliers identified
    
    df = df.copy()
    
    # Get the z-score for specified threshold
    threshold_z_score = stats.norm.ppf(threshold)
    
    # Get the z-scores for each value in df
    z_score_df = pd.DataFrame(np.abs(stats.zscore(df)), columns=df.columns)
    z_score_df = z_score_df > threshold_z_score
    
    # Get indices of the outliers
    outliers = z_score_df.sum(axis=1)
    outliers = outliers > 0
    outlier_indices = df.index[outliers]
    
    # Drop outlier examples
    df = df.drop(outlier_indices, axis=0).reset_index(drop=True)
    
    return df
In [140]:
df_tracks_af_clean = outliers_cleaner(df_tracks_af,0.99999)

In the following section some of the most relevant music feature are analyzed to better undestand how they affect popularity

Acousticness
In [141]:
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Acousticness'], kde=False, bins=30)
plt.show()
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [142]:
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data =  df_tracks.groupby('Acousticness')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Acousticness'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Acousticness vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()

It seems that the less acustic a song is, the more it is popular on average

Loudness
In [143]:
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Loudness'], kde=False, bins=30)
plt.show()
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [144]:
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data =  df_tracks.groupby('Loudness')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Loudness'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Loudness vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()

Values of Loudness between 0 and -5 seem to be the most popular ones. Also, it is not uncommon for songs to be in that range of Loudness

Instrumentalness
In [145]:
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Instrumentalness'], kde=False, bins=30)
plt.show()
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [146]:
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data =  df_tracks.groupby('Instrumentalness')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Instrumentalness'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Instrumentalness vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()

The vast majority of the tracks seem to have an Instrumentalness value close to 0

Energy
In [147]:
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Energy'], kde=False, bins=30)
plt.show()
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [148]:
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data =  df_tracks.groupby('Energy')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Energy'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Energy vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()

Energy values are more equally distributed. Between 0.2 and 0.8 there is a linear relationship between Popularity and Danceability

Danceability
In [149]:
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Danceability'], kde=False, bins=30)
plt.show()
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [150]:
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data =  df_tracks.groupby('Danceability')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Danceability'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Danceability vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()

Despite the normal distribution, like energy, danceability seems to be lienarly realted with popularty between 0.2 and 0.8

Popularity
In [151]:
df_tracks['Popularity'].describe()
Out[151]:
count    172202.000000
mean         30.446929
std          16.770288
min           6.000000
25%          17.000000
50%          28.000000
75%          42.000000
max          96.000000
Name: Popularity, dtype: float64
In [152]:
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Popularity'], kde=False, bins=30)
plt.show()
/Users/Janiwo/opt/anaconda3/envs/spotipy_env/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [153]:
fig, ax = plt.subplots(figsize=(20, 4))
ax = df_tracks.groupby('Year')['Track Name'].count().plot()
ax.set_title('Number of tracks over the years', weight='bold')
ax.set_ylabel('Number of Tracks', weight='bold')
ax.set_xlabel('Year', weight='bold')
ax.set_xticks(range(1920, 2021, 5))
plt.show()

The vast majority of the songs are recent. Also, very few song have high level of popularity

In [154]:
df_tracks.groupby('Year')['Track Name'].count()
Out[154]:
Year
1885        9
1923       33
1925        9
1926       23
1928       29
        ...  
2016     9068
2017    10665
2018    10894
2019    11230
2020    14845
Name: Track Name, Length: 82, dtype: int64
In [155]:
frequency_year = df_tracks.groupby('Year')['Track Name'].count().reset_index()
frequency_year.columns = ['Year','Number of tracks']
frequency_year['%'] = (frequency_year['Number of tracks']/(frequency_year['Number of tracks'].sum()))*100
frequency_year
Out[155]:
Year Number of tracks %
0 1885 9 0.005226
1 1923 33 0.019164
2 1925 9 0.005226
3 1926 23 0.013356
4 1928 29 0.016841
... ... ... ...
77 2016 9068 5.265909
78 2017 10665 6.193308
79 2018 10894 6.326291
80 2019 11230 6.521411
81 2020 14845 8.620690

82 rows × 3 columns

In [156]:
fig, ax = plt.subplots(figsize=(20, 4))
ax = df_tracks.groupby('Year')['Popularity'].max().plot()
ax.set_title('Max Popularity over the years', c='r', weight='bold')
ax.set_ylabel('Max Popularity', weight='bold')
ax.set_xlabel('Year', weight='bold')
ax.set_xticks(range(1920, 2021, 5))
plt.show()
In [157]:
fig, ax = plt.subplots(figsize = (12, 10))
top_songs = df_tracks.groupby('Track Name')['Popularity'].max().sort_values(ascending=False).head(25)
ax = sns.barplot(x=top_songs.values, y=top_songs.index, orient="h", edgecolor='black', ax=ax)
ax.set_xlabel('Popularity', fontsize=12)
ax.set_ylabel('Track', fontsize=12)
ax.set_title('Most Popular Tracks', fontsize=14, weight = 'bold')
plt.show()

SML

Setting the values for split test base on the findings of EDA and correlation plot

In [158]:
y = df_tracks_af_clean['Popularity']
X_all = df_tracks_af_clean.drop('Popularity', axis=1) #including all the numeric features
X = df_tracks_af_clean.drop(['Popularity','Key','Tempo','Mode',
                       'Liveness','Speechiness', 'Valence','Year'], 
                        axis=1) #including only features selected in EDA

Defining the X variables for Statsmodel, which requires a constant added to the data frame

In [159]:
X_stats = sm.add_constant(X)
X_all_stats = sm.add_constant(X_all)

Popularity prediction including all the years in the data frame

The first attempt in popularity prediction aims to predict popularityover all the years present in the data

All acustic features

The model is firstly fitted with all the music features

In [160]:
X_train, X_test, y_train, y_test = train_test_split(X_all_stats, y, test_size=0.2, random_state=RANDOM)
In [161]:
X_train.describe()
Out[161]:
const Acousticness Danceability Energy Instrumentalness Liveness Loudness Speechiness Valence Tempo Key Mode Year Explicit
count 135201.0 135201.000000 135201.000000 135201.000000 135201.000000 135201.000000 135201.000000 135201.000000 135201.000000 135201.000000 135201.000000 135201.000000 135201.000000 135201.000000
mean 1.0 0.361195 0.537000 0.578660 0.147786 0.268039 -9.747458 0.093464 0.468747 119.322430 5.232417 0.675735 2005.699203 0.158401
std 0.0 0.350785 0.183569 0.269139 0.302962 0.254684 5.979437 0.103104 0.254064 30.060773 3.564546 0.468101 15.249740 0.365118
min 1.0 0.000000 0.000000 0.000020 0.000000 0.000000 -36.119000 0.000000 0.000000 0.000000 0.000000 0.000000 1942.000000 0.000000
25% 1.0 0.035900 0.406000 0.384000 0.000000 0.102000 -12.007000 0.035900 0.262000 95.255000 2.000000 0.000000 2000.000000 0.000000
50% 1.0 0.228000 0.546000 0.620000 0.000052 0.148000 -7.878000 0.048600 0.458000 119.396000 5.000000 1.000000 2012.000000 0.000000
75% 1.0 0.684000 0.674000 0.804000 0.036500 0.335000 -5.620000 0.095900 0.667000 138.779000 8.000000 1.000000 2017.000000 0.000000
max 1.0 0.996000 0.987000 0.999000 0.999000 1.000000 0.496000 0.717000 1.000000 243.507000 11.000000 1.000000 2020.000000 1.000000
In [162]:
model = sm.OLS(y_train, X_train).fit()
In [163]:
model_summary=model.summary()
print(model_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Popularity   R-squared:                       0.214
Model:                            OLS   Adj. R-squared:                  0.214
Method:                 Least Squares   F-statistic:                     2826.
Date:                Tue, 02 Feb 2021   Prob (F-statistic):               0.00
Time:                        21:32:33   Log-Likelihood:            -5.5698e+05
No. Observations:              135201   AIC:                         1.114e+06
Df Residuals:                  135187   BIC:                         1.114e+06
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             -152.0211      5.671    -26.807      0.000    -163.136    -140.906
Acousticness        -4.0840      0.192    -21.225      0.000      -4.461      -3.707
Danceability         4.0282      0.306     13.172      0.000       3.429       4.628
Energy              -4.8886      0.345    -14.151      0.000      -5.566      -4.212
Instrumentalness    -6.6260      0.174    -38.128      0.000      -6.967      -6.285
Liveness            -9.2914      0.175    -53.014      0.000      -9.635      -8.948
Loudness             0.9102      0.014     63.756      0.000       0.882       0.938
Speechiness        -11.6120      0.462    -25.115      0.000     -12.518     -10.706
Valence             -4.9041      0.206    -23.862      0.000      -5.307      -4.501
Tempo                0.0047      0.001      3.358      0.001       0.002       0.007
Key                 -0.0128      0.012     -1.114      0.265      -0.035       0.010
Mode                -0.7407      0.089     -8.327      0.000      -0.915      -0.566
Year                 0.0996      0.003     35.293      0.000       0.094       0.105
Explicit             4.6823      0.134     34.840      0.000       4.419       4.946
==============================================================================
Omnibus:                     3965.139   Durbin-Watson:                   2.004
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             4314.447
Skew:                           0.436   Prob(JB):                         0.00
Kurtosis:                       2.916   Cond. No.                     2.81e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.81e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

The model can explain less than 1/4 of the dependand variable with all the music features. Key seems to not be statistically relevant.

In [164]:
predictions = model.predict(X_test)
In [165]:
mse = sm.tools.eval_measures.mse(y_test, predictions)
rmse = np.sqrt(mse)
print('Mean squared error: %.2f'
% mse)
print ('Root mean squared error: %.2f'
% rmse)
Mean squared error: 223.12
Root mean squared error: 14.94

The general model can predict popularity with a RMSE of 14.91, not too accuarate in a scale from 0 to 100

In [166]:
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=predictions)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()
Only relevant features

The second model is fitted with only the music features recognized relevant in the EDA section

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X_stats, y, test_size=0.2, random_state=RANDOM)
In [168]:
model = sm.OLS(y_train, X_train).fit()
In [169]:
model_summary=model.summary()
print(model_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Popularity   R-squared:                       0.180
Model:                            OLS   Adj. R-squared:                  0.180
Method:                 Least Squares   F-statistic:                     4953.
Date:                Tue, 02 Feb 2021   Prob (F-statistic):               0.00
Time:                        21:32:42   Log-Likelihood:            -5.5980e+05
No. Observations:              135201   AIC:                         1.120e+06
Df Residuals:                  135194   BIC:                         1.120e+06
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               49.8843      0.367    135.892      0.000      49.165      50.604
Acousticness        -6.2081      0.190    -32.620      0.000      -6.581      -5.835
Danceability         3.5351      0.259     13.626      0.000       3.027       4.044
Energy             -13.9914      0.316    -44.292      0.000     -14.610     -13.372
Instrumentalness    -3.0138      0.170    -17.698      0.000      -3.348      -2.680
Loudness             1.1373      0.014     80.399      0.000       1.110       1.165
Explicit             4.7645      0.121     39.485      0.000       4.528       5.001
==============================================================================
Omnibus:                     4616.973   Durbin-Watson:                   2.004
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             5048.808
Skew:                           0.467   Prob(JB):                         0.00
Kurtosis:                       2.842   Cond. No.                         137.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [170]:
predictions = model.predict(X_test)
resids = model.resid
In [171]:
mse = sm.tools.eval_measures.mse(y_test, predictions)
rmse = np.sqrt(mse)
print('Mean squared error: %.2f'
% mse)
print ('Root mean squared error: %.2f'
% rmse)
Mean squared error: 232.86
Root mean squared error: 15.26
In [172]:
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=predictions)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()

Popularity prediction only including 2020 songs

As analyzed in the EDA for popularity prediction, the release year seems to be a very influential variable. Popularity is calculated starting from the number of 'plays' a song got. Therefore, most recent songs are more likely to be popular as they are likely to be played by a higher number of listeners. Also, the vast majority of songs are from 2020. Therefore, popularity will be predicted considering only songs from 2020. The reasons for this choice are two folds. On the one hand, considering all the years does not seem to provide quality results. On the other hand, as popularity seems highly related to how recent a song is, predicting popularity starting from recent songs, seems reasonable.

In [173]:
df_tracks_af_2020 = df_tracks_af_clean[df_tracks_af_clean['Year'] == 2020]
In [174]:
df_tracks_af_2020.describe()
Out[174]:
Popularity Acousticness Danceability Energy Instrumentalness Liveness Loudness Speechiness Valence Tempo Key Mode Year Explicit
count 14622.000000 14622.000000 14622.000000 14622.000000 14622.000000 14622.000000 14622.000000 14622.000000 14622.000000 14622.000000 14622.000000 14622.000000 14622.0 14622.000000
mean 30.630146 0.556527 0.495932 0.430665 0.321808 0.207852 -13.668466 0.086960 0.426540 115.147118 5.135070 0.658460 2020.0 0.162700
std 20.391778 0.402914 0.203014 0.302976 0.404525 0.198454 8.392953 0.096804 0.260954 30.938016 3.514596 0.474242 0.0 0.369104
min 6.000000 0.000002 0.000000 0.000535 0.000000 0.021100 -36.117000 0.000000 0.000000 0.000000 0.000000 0.000000 2020.0 0.000000
25% 12.250000 0.108000 0.337000 0.137000 0.000000 0.099600 -20.568500 0.038000 0.200000 90.327000 2.000000 0.000000 2020.0 0.000000
50% 25.000000 0.661000 0.487000 0.418000 0.005850 0.126000 -10.835500 0.046900 0.400000 114.208500 5.000000 1.000000 2020.0 0.000000
75% 47.000000 0.972000 0.656000 0.695000 0.850000 0.229000 -6.441500 0.079800 0.633000 135.968750 8.000000 1.000000 2020.0 0.000000
max 96.000000 0.996000 0.977000 0.999000 0.996000 1.000000 0.654000 0.716000 0.990000 237.914000 11.000000 1.000000 2020.0 1.000000
In [175]:
df_tracks_af_clean.describe()
Out[175]:
Popularity Acousticness Danceability Energy Instrumentalness Liveness Loudness Speechiness Valence Tempo Key Mode Year Explicit
count 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000 169002.000000
mean 30.643827 0.360840 0.537410 0.578623 0.147978 0.267935 -9.748699 0.093446 0.468519 119.309824 5.227654 0.675388 2005.689915 0.158004
std 16.801719 0.350486 0.183345 0.269069 0.302975 0.254411 5.985740 0.103177 0.253915 30.044546 3.565180 0.468231 15.255007 0.364746
min 6.000000 0.000000 0.000000 0.000020 0.000000 0.000000 -36.119000 0.000000 0.000000 0.000000 0.000000 0.000000 1942.000000 0.000000
25% 17.000000 0.036100 0.407000 0.384000 0.000000 0.102000 -12.004750 0.035900 0.262000 95.254250 2.000000 0.000000 2000.000000 0.000000
50% 29.000000 0.228000 0.547000 0.620000 0.000054 0.148000 -7.875500 0.048500 0.458000 119.478000 5.000000 1.000000 2012.000000 0.000000
75% 42.000000 0.683000 0.674000 0.804000 0.037100 0.335000 -5.620000 0.095900 0.667000 138.725750 8.000000 1.000000 2017.000000 0.000000
max 96.000000 0.996000 0.989000 1.000000 0.999000 1.000000 0.654000 0.717000 1.000000 243.507000 11.000000 1.000000 2020.000000 1.000000
In [176]:
y = df_tracks_af_2020['Popularity']
X_all = df_tracks_af_2020.drop('Popularity', axis=1) #including all the numeric features
X = df_tracks_af_2020.drop(['Popularity','Key','Tempo','Mode',
                       'Liveness','Speechiness', 'Valence', 'Year'], 
                        axis=1) #including only features selected in EDA

Defining the X variables for Statsmodel, which requires a constant added to the data frame

In [177]:
X_stats = sm.add_constant(X)
X_all_stats = sm.add_constant(X_all)
All features
In [178]:
X_train, X_test, y_train, y_test = train_test_split(X_all_stats, y, test_size=0.2, random_state=RANDOM)
In [179]:
model = sm.OLS(y_train, X_train).fit()
In [180]:
model_summary=model.summary()
print(model_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Popularity   R-squared:                       0.555
Model:                            OLS   Adj. R-squared:                  0.554
Method:                 Least Squares   F-statistic:                     1212.
Date:                Tue, 02 Feb 2021   Prob (F-statistic):               0.00
Time:                        21:32:54   Log-Likelihood:                -47142.
No. Observations:               11697   AIC:                         9.431e+04
Df Residuals:                   11684   BIC:                         9.441e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Acousticness       -10.2262      0.681    -15.022      0.000     -11.561      -8.892
Danceability        15.6369      0.982     15.917      0.000      13.711      17.563
Energy              -7.2453      1.159     -6.252      0.000      -9.517      -4.974
Instrumentalness    -6.5048      0.450    -14.467      0.000      -7.386      -5.623
Liveness            -4.2335      0.703     -6.023      0.000      -5.611      -2.856
Loudness             0.8324      0.037     22.223      0.000       0.759       0.906
Speechiness          6.8187      1.583      4.306      0.000       3.715       9.923
Valence             -6.5128      0.595    -10.937      0.000      -7.680      -5.346
Tempo                0.0128      0.004      2.963      0.003       0.004       0.021
Key                  0.0463      0.036      1.281      0.200      -0.025       0.117
Mode                -0.2789      0.271     -1.029      0.303      -0.810       0.252
Year                 0.0221      0.001     33.575      0.000       0.021       0.023
Explicit            11.8471      0.430     27.571      0.000      11.005      12.689
==============================================================================
Omnibus:                      184.460   Durbin-Watson:                   2.002
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              214.812
Skew:                           0.256   Prob(JB):                     2.26e-47
Kurtosis:                       3.423   Cond. No.                     2.59e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.59e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

The model can now explain 55.2% of the dependand variable with all the music features.A signifcant improvement from the model considering all the years. Also, in this new model it's possible to see how Key, Tempo and Mode don't seem to be statistically relevant

In [181]:
predictions = model.predict(X_test)
In [182]:
mse = sm.tools.eval_measures.mse(y_test, predictions)
rmse = np.sqrt(mse)
print('Mean squared error: %.2f'
% mse)
print ('Root mean squared error: %.2f'
% rmse)
Mean squared error: 190.97
Root mean squared error: 13.82

The general model can predict popularity with a RMSE of 14.91, not too accuarate in a scale from 0 to 100

In [183]:
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=predictions)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()
In [184]:
residuals = y_test - predictions
In [185]:
sns.mpl.rcParams['figure.figsize'] = (15.0, 7.0)
fig, ax = plt.subplots(1,2 )
    
sns.regplot(x=predictions, y=y_test, lowess=True, ax=ax[0], line_kws={'color': 'red'})
ax[0].set_title('Observed vs. Predicted Values', fontsize=16)
ax[0].set(xlabel='Predicted', ylabel='Observed')

sns.regplot(x=predictions, y=residuals, lowess=True, ax=ax[1], line_kws={'color': 'red'})
ax[1].set_title('Residuals vs. Predicted Values', fontsize=16)
ax[1].set(xlabel='Predicted', ylabel='Residuals')
Out[185]:
[Text(0.5, 0, 'Predicted'), Text(0, 0.5, 'Residuals')]
Only relevant features
In [186]:
X_train, X_test, y_train, y_test = train_test_split(X_stats, y, test_size=0.2, random_state=RANDOM)
In [187]:
model = sm.OLS(y_train, X_train).fit()
In [188]:
model_summary=model.summary()
print(model_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Popularity   R-squared:                       0.548
Model:                            OLS   Adj. R-squared:                  0.547
Method:                 Least Squares   F-statistic:                     2359.
Date:                Tue, 02 Feb 2021   Prob (F-statistic):               0.00
Time:                        21:33:04   Log-Likelihood:                -47233.
No. Observations:               11697   AIC:                         9.448e+04
Df Residuals:                   11690   BIC:                         9.453e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               48.0766      1.173     40.985      0.000      45.777      50.376
Acousticness       -12.6755      0.653    -19.418      0.000     -13.955     -11.396
Danceability        12.4202      0.849     14.637      0.000      10.757      14.083
Energy             -11.5308      1.078    -10.700      0.000     -13.643      -9.418
Instrumentalness    -6.0134      0.445    -13.508      0.000      -6.886      -5.141
Loudness             0.8694      0.037     23.366      0.000       0.796       0.942
Explicit            13.2031      0.394     33.533      0.000      12.431      13.975
==============================================================================
Omnibus:                      214.321   Durbin-Watson:                   2.000
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              242.895
Skew:                           0.292   Prob(JB):                     1.80e-53
Kurtosis:                       3.397   Cond. No.                         205.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [189]:
predictions = model.predict(X_test)
residuals = y_test - predictions
In [190]:
mse = sm.tools.eval_measures.mse(y_test, predictions)
rmse = np.sqrt(mse)
print('Mean squared error: %.2f'
% mse)
print ('Root mean squared error: %.2f'
% rmse)
Mean squared error: 192.90
Root mean squared error: 13.89
In [191]:
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=predictions)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()
In [192]:
sns.mpl.rcParams['figure.figsize'] = (15, 7)
fig, ax = plt.subplots(1,2 )
    
sns.regplot(x=predictions, y=y_test, lowess=True, ax=ax[0], line_kws={'color': 'red'})
ax[0].set_title('Observed vs. Predicted Values', fontsize=16)
ax[0].set(xlabel='Predicted', ylabel='Observed')

sns.regplot(x=predictions, y=residuals, lowess=True, ax=ax[1], line_kws={'color': 'red'})
ax[1].set_title('Residuals vs. Predicted Values', fontsize=16)
ax[1].set(xlabel='Predicted', ylabel='Residuals')
Out[192]:
[Text(0.5, 0, 'Predicted'), Text(0, 0.5, 'Residuals')]

It seems that none of the models does not follow a linear behaviour in predicting popularity between 0 and 30. Also, despite the improvements accurate prediction is not possible yet. Therefore some more models will be considered. Lasso and Ridge regression could reduce erros, so they will be tested next

Lasso regression with all music features
In [193]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size = 0.3, random_state=RANDOM)
In [194]:
#grid search for hypertuning parameters
param_grid = {'alpha': np.arange(0.1, 50)} 
lasso = Lasso()
lasso_cv = GridSearchCV(lasso, param_grid, cv=5) 
lasso_cv.fit(X_train, y_train)
lasso_cv.best_params_
Out[194]:
{'alpha': 0.1}
In [195]:
#lasso
lasso = Lasso(alpha=0.1) 
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test) 
lasso.score(X_test, y_test)
Out[195]:
0.5369551782831212
In [196]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, lasso_pred))
print('Mean RMSE: %.2f'
      % np.sqrt(metrics.mean_squared_error(y_test,lasso_pred)))
print('Coefficient of determination (1 is perfect prediction): %.4f'
      % r2_score(y_test,lasso_pred))
Mean squared error: 192.51
Mean RMSE: 13.87
Coefficient of determination (1 is perfect prediction): 0.5370

Lasso regretion only relevant features

In [197]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=RANDOM)
In [198]:
#grid search
param_grid = {'alpha': np.arange(0.1, 50)} 
lasso = Lasso(normalize=False)
lasso_cv = GridSearchCV(lasso, param_grid, cv=5) 
lasso_cv.fit(X_train, y_train)
lasso_cv.best_params_
Out[198]:
{'alpha': 0.1}
In [199]:
lasso = Lasso(alpha=0.1, normalize=False) 
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test) 
lasso.score(X_test, y_test)
Out[199]:
0.531067905234481
In [200]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, lasso_pred))
print('Mean RMSE: %.2f'
      % np.sqrt(metrics.mean_squared_error(y_test,lasso_pred)))
print('Coefficient of determination (1 is perfect prediction): %.4f'
      % r2_score(y_test,lasso_pred))
Mean squared error: 194.96
Mean RMSE: 13.96
Coefficient of determination (1 is perfect prediction): 0.5311

Ridge no scaled only relevenat

In [201]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.3, random_state=RANDOM) 
In [202]:
#grid search
param_grid = {'alpha': np.arange(0.1, 40)} 
ridge = Ridge(normalize=False)
ridge_cv = GridSearchCV(ridge, param_grid, cv=5) 
ridge_cv.fit(X, y)
ridge_cv.best_params_
Out[202]:
{'alpha': 39.1}
In [203]:
ridge = Ridge(alpha=39.1, normalize=False) 
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test) 
ridge.score(X_test, y_test)
Out[203]:
0.5341523811800759
In [204]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, ridge_pred))
print('Mean RMSE: %.2f'
      % np.sqrt(metrics.mean_squared_error(y_test,ridge_pred)))
print('Coefficient of determination (1 is perfect prediction): %.4f'
      % r2_score(y_test,ridge_pred))
Mean squared error: 193.68
Mean RMSE: 13.92
Coefficient of determination (1 is perfect prediction): 0.5342

Ridge all

In [205]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y,test_size = 0.3, random_state=RANDOM)
In [206]:
#grid search
param_grid = {'alpha': np.arange(0.1, 40)} 
ridge = Ridge(normalize=False)
ridge_cv = GridSearchCV(ridge, param_grid, cv=5) 
ridge_cv.fit(X_all, y)
ridge_cv.best_params_
Out[206]:
{'alpha': 39.1}
In [207]:
ridge = Ridge(alpha=39.1, normalize=False) 
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test) 
ridge.score(X_test, y_test)
Out[207]:
0.5399697680807802
In [208]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, ridge_pred))
print('Mean RMSE: %.2f'
      % np.sqrt(metrics.mean_squared_error(y_test,ridge_pred)))
print('Coefficient of determination (1 is perfect prediction): %.4f'
      % r2_score(y_test,ridge_pred))
Mean squared error: 191.26
Mean RMSE: 13.83
Coefficient of determination (1 is perfect prediction): 0.5400

yellowbrick plots for linearity testing

Similarly to what has been done for the statsmodels' linear regression results, yellowbrick is now used to analyze the residual plots of both Lasso and Ridge regression, to understand if regularized linear regressors reduced non-linearity (more about the package function here https://www.scikit-yb.org/en/latest/api/regressor/residuals.html)

In [209]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=RANDOM)
model = Lasso(alpha=0.1)
visualizer = ResidualsPlot(model, hist=True)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
Out[209]:
0.5364687061790601
In [210]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM)
model = Lasso(alpha=0.1)
visualizer = ResidualsPlot(model, hist=True)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
Out[210]:
0.5309085901155718
In [211]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=RANDOM)
model = Ridge(39.1)
visualizer = ResidualsPlot(model, hist=True)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
Out[211]:
0.5385237764994839
In [212]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM)
model = Ridge(39.1)
visualizer = ResidualsPlot(model, hist=True)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
Out[212]:
0.5336078514251665

Similarly to what was observed in the Linear Regression estimators, also Lasso and Ridge do not follow a linear relationship for the lower values of popularity. Therefore, a not-linear regressor is used.

Random Forest Regressor with all features

In [213]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=RANDOM)
In [214]:
#grid search
param_grid = {'n_estimators': np.arange(25, 201, 25)} 
rf = RandomForestRegressor()
rf = GridSearchCV(rf, param_grid, cv=3, verbose=10) 
rf.fit(X_all, y)
rf.best_params_
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] n_estimators=25 .................................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ..................... n_estimators=25, score=0.131, total=   2.0s
[CV] n_estimators=25 .................................................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s
[CV] ..................... n_estimators=25, score=0.398, total=   1.8s
[CV] n_estimators=25 .................................................
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.8s remaining:    0.0s
[CV] ..................... n_estimators=25, score=0.346, total=   1.7s
[CV] n_estimators=50 .................................................
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.5s remaining:    0.0s
[CV] ..................... n_estimators=50, score=0.152, total=   3.9s
[CV] n_estimators=50 .................................................
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.4s remaining:    0.0s
[CV] ..................... n_estimators=50, score=0.401, total=   3.7s
[CV] n_estimators=50 .................................................
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   13.1s remaining:    0.0s
[CV] ..................... n_estimators=50, score=0.363, total=   3.4s
[CV] n_estimators=75 .................................................
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   16.6s remaining:    0.0s
[CV] ..................... n_estimators=75, score=0.149, total=   6.2s
[CV] n_estimators=75 .................................................
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   22.8s remaining:    0.0s
[CV] ..................... n_estimators=75, score=0.404, total=   5.5s
[CV] n_estimators=75 .................................................
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   28.3s remaining:    0.0s
[CV] ..................... n_estimators=75, score=0.366, total=   5.0s
[CV] n_estimators=100 ................................................
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   33.4s remaining:    0.0s
[CV] .................... n_estimators=100, score=0.152, total=   7.8s
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.409, total=   7.0s
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.365, total=   7.0s
[CV] n_estimators=125 ................................................
[CV] .................... n_estimators=125, score=0.150, total=  10.1s
[CV] n_estimators=125 ................................................
[CV] .................... n_estimators=125, score=0.411, total=   9.6s
[CV] n_estimators=125 ................................................
[CV] .................... n_estimators=125, score=0.368, total=   9.4s
[CV] n_estimators=150 ................................................
[CV] .................... n_estimators=150, score=0.152, total=  13.7s
[CV] n_estimators=150 ................................................
[CV] .................... n_estimators=150, score=0.411, total=  11.4s
[CV] n_estimators=150 ................................................
[CV] .................... n_estimators=150, score=0.373, total=  11.0s
[CV] n_estimators=175 ................................................
[CV] .................... n_estimators=175, score=0.153, total=  13.7s
[CV] n_estimators=175 ................................................
[CV] .................... n_estimators=175, score=0.409, total=  12.8s
[CV] n_estimators=175 ................................................
[CV] .................... n_estimators=175, score=0.369, total=  12.6s
[CV] n_estimators=200 ................................................
[CV] .................... n_estimators=200, score=0.151, total=  15.8s
[CV] n_estimators=200 ................................................
[CV] .................... n_estimators=200, score=0.411, total=  15.4s
[CV] n_estimators=200 ................................................
[CV] .................... n_estimators=200, score=0.371, total=  14.3s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  3.4min finished
Out[214]:
{'n_estimators': 150}
In [215]:
RF = RandomForestRegressor(n_estimators=150)
RF.fit(X_train,y_train)
pred = RF.predict(X_test)
In [216]:
mse=mean_squared_error(y_test,pred )
rmse = np.sqrt(mse)
print(mse)
print(rmse)
149.87141474775387
12.242198117485025
In [217]:
pd.DataFrame({'Variable':X_train.columns,
              'Importance':RF.feature_importances_}).sort_values('Importance', ascending=False)
Out[217]:
Variable Importance
5 Loudness 0.528696
12 Explicit 0.067034
3 Instrumentalness 0.055170
7 Valence 0.052844
1 Danceability 0.050968
0 Acousticness 0.050129
4 Liveness 0.048083
2 Energy 0.041756
6 Speechiness 0.041358
8 Tempo 0.040812
9 Key 0.018889
10 Mode 0.004262
11 Year 0.000000

The random forest regressor gives the lowest error. In order to obtain more information about how it reached its output. Firstly we use both the classifier-owned ‘feature importance’ method and eli5 to understand which features which are the features considered the most relevant by the model. Secondly, we calculate features’ contribution in affecting the weight result (https://towardsdatascience.com/machine-learning-explainability-introduction-via-eli5-99c767f017e2 )

In [218]:
#Permutation Importance
perm = PermutationImportance(RF ,random_state=RANDOM).fit(X_test, y_test)
show_weights(perm, feature_names = list(X_test.columns))
Out[218]:
Weight Feature
0.3472 ± 0.0215 Loudness
0.0895 ± 0.0168 Instrumentalness
0.0705 ± 0.0089 Acousticness
0.0649 ± 0.0046 Explicit
0.0582 ± 0.0058 Danceability
0.0398 ± 0.0032 Valence
0.0254 ± 0.0072 Energy
0.0170 ± 0.0029 Liveness
0.0119 ± 0.0024 Speechiness
0.0084 ± 0.0007 Tempo
0.0025 ± 0.0020 Key
0.0001 ± 0.0006 Mode
0 ± 0.0000 Year
In [219]:
show_weights(RF, feature_names = list(X_test.columns))
Out[219]:
Weight Feature
0.5287 ± 0.0150 Loudness
0.0670 ± 0.0086 Explicit
0.0552 ± 0.0076 Instrumentalness
0.0528 ± 0.0073 Valence
0.0510 ± 0.0087 Danceability
0.0501 ± 0.0081 Acousticness
0.0481 ± 0.0093 Liveness
0.0418 ± 0.0079 Energy
0.0414 ± 0.0081 Speechiness
0.0408 ± 0.0074 Tempo
0.0189 ± 0.0044 Key
0.0043 ± 0.0019 Mode
0 ± 0.0000 Year
In [220]:
show_prediction(RF, X_test.iloc[1],show_feature_values=True)
Out[220]:

y (score 52.147) top features

Contribution? Feature Value
+30.538 <BIAS> 1.000
+16.976 Loudness -2.211
+5.762 Tempo 160.054
+3.877 Instrumentalness 0.000
+3.113 Danceability 0.671
+1.288 Valence 0.659
+0.154 Acousticness 0.343
-0.084 Mode 1.000
-0.893 Liveness 0.345
-0.903 Energy 0.786
-1.004 Key 11.000
-2.115 Speechiness 0.221
-4.563 Explicit 0.000
In [221]:
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=pred)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()

Despite all the efforts in improving the models and the good improvements obtained so far it seems like it's still not possible for any model to predict popularity above 65. Nonethelss it was possible to identify which are the elements that affects popularity the most thanks to eli5.

Random Forest Regressor with selected features

In [222]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM)
In [223]:
#grid search
param_grid = {'n_estimators': np.arange(25, 201, 25)} 
rf = RandomForestRegressor()
rf = GridSearchCV(rf, param_grid, cv=3, verbose = 10) 
rf.fit(X_all, y)
rf.best_params_
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] n_estimators=25 .................................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ..................... n_estimators=25, score=0.133, total=   2.4s
[CV] n_estimators=25 .................................................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.4s remaining:    0.0s
[CV] ..................... n_estimators=25, score=0.392, total=   2.0s
[CV] n_estimators=25 .................................................
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.3s remaining:    0.0s
[CV] ..................... n_estimators=25, score=0.347, total=   1.8s
[CV] n_estimators=50 .................................................
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.1s remaining:    0.0s
[CV] ..................... n_estimators=50, score=0.150, total=   4.4s
[CV] n_estimators=50 .................................................
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.6s remaining:    0.0s
[CV] ..................... n_estimators=50, score=0.408, total=   3.7s
[CV] n_estimators=50 .................................................
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.3s remaining:    0.0s
[CV] ..................... n_estimators=50, score=0.357, total=   4.3s
[CV] n_estimators=75 .................................................
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   18.7s remaining:    0.0s
[CV] ..................... n_estimators=75, score=0.149, total=   6.4s
[CV] n_estimators=75 .................................................
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   25.1s remaining:    0.0s
[CV] ..................... n_estimators=75, score=0.409, total=   5.6s
[CV] n_estimators=75 .................................................
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   30.7s remaining:    0.0s
[CV] ..................... n_estimators=75, score=0.367, total=   5.4s
[CV] n_estimators=100 ................................................
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   36.1s remaining:    0.0s
[CV] .................... n_estimators=100, score=0.152, total=   8.3s
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.407, total=   7.4s
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.368, total=   6.9s
[CV] n_estimators=125 ................................................
[CV] .................... n_estimators=125, score=0.153, total=   9.7s
[CV] n_estimators=125 ................................................
[CV] .................... n_estimators=125, score=0.415, total=   8.8s
[CV] n_estimators=125 ................................................
[CV] .................... n_estimators=125, score=0.372, total=  10.7s
[CV] n_estimators=150 ................................................
[CV] .................... n_estimators=150, score=0.152, total=  13.5s
[CV] n_estimators=150 ................................................
[CV] .................... n_estimators=150, score=0.411, total=  11.2s
[CV] n_estimators=150 ................................................
[CV] .................... n_estimators=150, score=0.366, total=  11.9s
[CV] n_estimators=175 ................................................
[CV] .................... n_estimators=175, score=0.156, total=  14.6s
[CV] n_estimators=175 ................................................
[CV] .................... n_estimators=175, score=0.411, total=  13.9s
[CV] n_estimators=175 ................................................
[CV] .................... n_estimators=175, score=0.367, total=  13.3s
[CV] n_estimators=200 ................................................
[CV] .................... n_estimators=200, score=0.151, total=  16.5s
[CV] n_estimators=200 ................................................
[CV] .................... n_estimators=200, score=0.410, total=  16.0s
[CV] n_estimators=200 ................................................
[CV] .................... n_estimators=200, score=0.371, total=  14.8s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  3.6min finished
Out[223]:
{'n_estimators': 125}
In [224]:
RF = RandomForestRegressor(n_estimators=125)
RF.fit(X_train,y_train)
y_train_pred = RF.predict(X_train).clip(0, 1)
In [225]:
RF.score(X_test,y_test)
Out[225]:
0.5943793965890145
In [226]:
mse=mean_squared_error(y_test,pred )
rmse = np.sqrt(mse)
print(mse)
print(rmse)
149.87141474775387
12.242198117485025
In [227]:
pd.DataFrame({'Variable':X_train.columns,
              'Importance':RF.feature_importances_}).sort_values('Importance', ascending=False)
Out[227]:
Variable Importance
4 Loudness 0.574174
0 Acousticness 0.094686
1 Danceability 0.090502
2 Energy 0.090021
3 Instrumentalness 0.082445
5 Explicit 0.068172
In [228]:
#Permutation Importance
perm = PermutationImportance(RF ,random_state=RANDOM).fit(X_test, y_test)
show_weights(perm, feature_names = list(X_test.columns))
Out[228]:
Weight Feature
0.3015 ± 0.0239 Loudness
0.1233 ± 0.0107 Acousticness
0.0888 ± 0.0163 Explicit
0.0868 ± 0.0202 Instrumentalness
0.0794 ± 0.0061 Energy
0.0390 ± 0.0059 Danceability
In [229]:
show_weights(RF, feature_names = list(X_test.columns))
Out[229]:
Weight Feature
0.5742 ± 0.0168 Loudness
0.0947 ± 0.0100 Acousticness
0.0905 ± 0.0107 Danceability
0.0900 ± 0.0106 Energy
0.0824 ± 0.0080 Instrumentalness
0.0682 ± 0.0075 Explicit
In [230]:
show_prediction(RF, X_test.iloc[1],show_feature_values=True)
Out[230]:

y (score 42.936) top features

Contribution? Feature Value
+30.515 <BIAS> 1.000
+14.481 Loudness -2.211
+4.614 Instrumentalness 0.000
+1.921 Danceability 0.671
-1.603 Energy 0.786
-2.410 Acousticness 0.343
-4.583 Explicit 0.000
In [231]:
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=pred)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()

Despite all the efforts in improving the models and the good improvements obtained so far it seems like it's still not possible for any model to predict popularity above 65. As previosuly said, artist popularity may affect popularity. Let's test this hypothesis

Network Analysis

In [232]:
df_nodes = pd.read_csv(PATH_NODES, index_col=0)
df_edgelist = pd.read_csv(PATH_EDGES, index_col=0)

NetworkX

Create list with variables for the network to hold as node data

In [233]:
node_att_names = ["Rank", "Tracks Bins", "Popularity", "Popularity Bins", "Genre", "Genre Color"]
In [234]:
def generate_networkx(df_artists, df_edgelist, node_att_names):
    # since items in df_artists are accessed over index, it needs to be resetted to ensure it does not have 
    # any missing values in the number series
    df_artists.reset_index(inplace=True, drop=True)
    G = nx.Graph()
    for counter in range(len(df_artists)):
        if df_artists.loc[counter, "Artist Name"] in set(pd.concat([df_edgelist["Artist_1"],
                                                                    df_edgelist["Artist_2"]])):
            G.add_node(df_artists.loc[counter, "Artist Name"])
            for att_name in node_att_names:
                G.nodes[df_artists.loc[counter, "Artist Name"]][att_name] = df_artists.loc[counter, att_name]

    for artist_1, artist_2, weight in zip(df_edgelist["Artist_1"], 
                                          df_edgelist["Artist_2"], 
                                          df_edgelist["Weight for Viz"]):
        if artist_1 in G.nodes() and artist_2 in G.nodes():
            G.add_edge(artist_1, artist_2, weight = weight)
    return G

The overall network is generated.

In [235]:
G = generate_networkx(df_nodes, df_edgelist, node_att_names)
In [236]:
len(df_nodes) - len(G.nodes)
Out[236]:
174

From the nodes dataframe given in, 174 are not connected in the network. They are taken out to avoid noise and to put the focus in the analysis of artists and their behaviour that show connections with other artists.

Narrowring down based on different weights for further analysis later on.


Create network with only edges with a weight more than one

In [237]:
G_two_plus = generate_networkx(df_nodes, df_edgelist[df_edgelist["Weight"]>1], node_att_names)

Create network with only edges with a weight more than five

In [238]:
G_five_plus = generate_networkx(df_nodes, df_edgelist[df_edgelist["Weight"]>5], node_att_names)

Create network with only edges with a weight more than ten

In [239]:
G_ten_plus = generate_networkx(df_nodes, df_edgelist[df_edgelist["Weight"]>9], node_att_names)
Enrich data with partition by the louvain algorithm

Assign with the louvain algorithm nodes a group, based on the network shape.

In [240]:
partition = community_louvain.best_partition(G, random_state=RANDOM)
In [241]:
nx.set_node_attributes(G, partition, "Partition")

Enrich data with centrality measures. Betweenness centrality is left out since it does not really add any value for the case at hand.

In [242]:
degree_centrality = nx.degree_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality_numpy(G, weight="weight")
#betweenness_centrality = nx.betweenness_centrality(G, weight="weight")
In [243]:
nx.set_node_attributes(G, degree_centrality, "Degree Centrality")
nx.set_node_attributes(G, eigenvector_centrality, "Eigenvector Centrality")
#nx.set_node_attributes(G, betweenness_centrality, "Betweenness Centrality")

The data for the network's node are taken and placed in a dataframe.

In [244]:
df_nodes_attributes = pd.DataFrame.from_dict(dict(G.nodes(data=True)),
                                       orient="index").reset_index().rename({"index": "Artist Name"}, axis=1)

Assigning the partition colors as before done for the genres.

In [245]:
partition_colors = pd.DataFrame(df_nodes_attributes["Partition"].value_counts())
# replace the counting with index, since not needed anymore
partition_colors["Partition"] = partition_colors.index
In [246]:
len(partition_colors)
Out[246]:
13
In [247]:
partition_colors["Partition Color"] = ["#cd6155",
                                       "#566573",
                                       "#99a3a4",
                                       "#3bc14a",
                                       "#5499c7",
                                       "#057476",
                                       "#2980b9",
                                       "#bb8fce",
                                       "#dc7633",
                                       "#ffe900", 
                                       "#eb984e",
                                       "#4290f5",
                                       "#fcba03"]
In [248]:
partition_list = pd.merge(df_nodes_attributes, partition_colors, on=["Partition"], how="left")
In [249]:
partition_list.drop(columns=["Rank", 
                             "Tracks Bins", 
                             "Popularity",
                             "Popularity Bins",
                             "Genre", 
                             "Genre Color"], inplace=True)

Merge Partition df to df_artists

In [250]:
df_nodes = pd.merge(df_nodes, partition_list, on="Artist Name", how="left")

Overall network measures

In [251]:
nx.density(G)
Out[251]:
0.02235402622152244

With a share of 2.2% of all possible connections in the network, the overall network between the artists worldwide is quite loose. This is logical since there are many different genres out there and net everyone can collaborate.

In [252]:
nx.transitivity(G)
Out[252]:
0.2895544991796727

The transitivity, also called clustering coefficient, indicates how locally a network is cluster. It gives out the probability of closed triplets. With around 29.0% this is quite high compared to the overall connectedness and considering the size of the network.

In [253]:
# nx.diameter(G)

The diameter calculates longest of the shortest paths between to nodes in the network and throws an error since the generated network creates multiple unconnected networks. The biggest network could be selected to get a value, but since the measure does not work very well in regards to interprebility for the case at hand anyhow, this is not done.

In [254]:
# nx.average_shortest_path_length(G)

Also the average shortes path length throws and error for the same reasons as the diameter measure. It also is not useful in regards to interpretability and therefore not adjusted and calculated.


Most collaborative Artists in the overall network by weight

In [255]:
df_edgelist = df_edgelist.reset_index()
In [256]:
edges_count = pd.melt(df_edgelist, id_vars=['index','Weight'], value_vars=['Artist_1', 'Artist_2'], 
        var_name='Listing', value_name='Artists')

Add weight as a measure to the dataframe

In [257]:
df_nodes = pd.merge(df_nodes, edges_count.groupby('Artists')["Weight"].sum(),
                              left_on="Artist Name", right_index=True, how="left")
In [258]:
pd.DataFrame(df_nodes.groupby("Artist Name")["Weight"].sum().sort_values(ascending=False)).head()
Out[258]:
Weight
Artist Name
Gucci Mane 479.0
Rick Ross 427.0
Lil Wayne 398.0
Future 348.0
Young Thug 313.0
In [259]:
for artist in ["Gucci Mane", "Rick Ross", "Lil Wayne", "Future", "Young Thug"]:
    print(df_nodes.loc[df_nodes["Artist Name"] == artist, 
                                 ["Tracks Collabo Share", "Artist Name"]])
     Tracks Collabo Share Artist Name
142              0.548421  Gucci Mane
     Tracks Collabo Share Artist Name
364              0.829082   Rick Ross
    Tracks Collabo Share Artist Name
69              0.633609   Lil Wayne
    Tracks Collabo Share Artist Name
32              0.570058      Future
    Tracks Collabo Share Artist Name
85              0.782609  Young Thug

It can directly be noticed that especially Artists from the pop rap/rap genre tend to collaborate heavily for their songs. This is also the case in the overall output if their songs. The range of the top five artist is between around 55% up to 83%.


Most important artists by their centrality measures.

Degree Centrality:

In [260]:
df_nodes.loc[:, ["Artist Name", "Degree Centrality"]].sort_values(by="Degree Centrality", 
                                                                            ascending=False).head(5)
Out[260]:
Artist Name Degree Centrality
69 Lil Wayne 0.145367
32 Future 0.140575
122 Ty Dolla $ign 0.138978
34 Nicki Minaj 0.137380
142 Gucci Mane 0.135783

Eigenvector Centrality:

In [261]:
df_nodes.loc[:, ["Artist Name", "Eigenvector Centrality"]].sort_values(by="Eigenvector Centrality", 
                                                                            ascending=False).head(5)
Out[261]:
Artist Name Eigenvector Centrality
142 Gucci Mane 0.335215
364 Rick Ross 0.320372
69 Lil Wayne 0.279134
32 Future 0.247427
99 DJ Khaled 0.241401

Also the two measures degree centrality and eigenvector centrality support the finding of the weights that the network is highly characterized by the most famous pop rappers and rappers.

Partition analysis

Now the network is looked at from the perspective of the partition, which is the result from the louvain algorithm. First the network is visualized with focus of the partition feature.

Network visualizations

In [262]:
nx.draw_kamada_kawai(G, 
               node_color=list(partition.values()))

Since the visualization is static and too packed, its not optimal for gaining insights. In the following, more advanced visualization techniques are used.


In [263]:
# Setting the default figure size a bit larger
defaults = dict(width=750, height=750, padding=0.1,
                xaxis=None, yaxis=None)
hv.opts.defaults(
    opts.EdgePaths(**defaults), opts.Graph(**defaults), opts.Nodes(**defaults))
In [264]:
graph = hv.Graph.from_networkx(G, nx.layout.fruchterman_reingold_layout).opts(
                                                                        tools=["hover"],
                                                                        edge_alpha=0.2,
                                                                        node_color="Partition", 
                                                                        #node_size="Centrality Degree",
                                                                        cmap="Set1",
                                                                        legend_position="right"
                                                                        )

show(hv.render((graph)))

Holoviews offers better solution since it is dynamic and allows to zoom in. Another option which also offers non-static visualizations is pyvis.


In [265]:
def create_pyvis_graph_partition(df_artists, df_edgelist, height="1500px", width="100%", bgcolor="#222222", 
                        font_color="#1E1C1E", algo="barnes", notebook=True, 
                        buttons_YN=True, edge_width_YN=True, heading="", file_name="pyvis_graph.html"):
    """
    Creates a pyvis visualization of a network colored by partition, edge width by weight and node size by 
    degree centrality.
    
    Input:
    - df_artists: dataframe holding the relevant artist data
    - df_edgelist: dataframe holding the relevant edges  
    - height (str): defines height of the graph
    - bgcolor (str): defines of the background (HTML color code can be used or simple color naming)
    - fontcolor (str): defines the font color (HTML color code can be used or simple color naming)
    - algo (str): the options "barnes", "forced" and "hr" are possible options for the algorithm to set up graph
    - notebook (boolean): determines if graph is saved as seperate HTML file or shown within the notebook
    - buttons_YN (boolean): adds a bar with design setting options on the side
    - edge_with_YN (boolean): adjusts the width of the edges based on the weight if turned on
    - heading (str): adds a header to the graph
    
    Output:
    pyvis graph
    """
    # Set up network
    g = Network(height=height, width=width, 
                bgcolor=bgcolor, font_color=font_color,
                notebook=notebook, heading=heading)

    # Set width of graph depending if setting should be shown on the side, only if not displayed in notebook
    # since there results in issues
    if notebook==True:
        if buttons_YN == True:
            g.show_buttons()
        else:
            g.width = "100%"
    else:
        if buttons_YN == True:
            g.width = "70%"
            g.show_buttons(["physics"])
        else:
            g.width = "100%"
    # Get unique list of artists to create nodes
    for artist, value, genre, color in zip(df_artists["Artist Name"], 
                                           df_artists["Degree Centrality"], 
                                           df_artists["Partition"], 
                                           df_artists["Partition Color"]):
        if artist in set(pd.concat([df_edgelist["Artist_1"], df_edgelist["Artist_2"]])):
            g.add_node(artist, 
                       value=value, 
                       title= f"Artist: {artist} | Partition: {genre}", 
                       color=color,
                       partition=partition)
    # Create edges with all artist pairs
    if edge_width_YN == True:
        for artist_1, artist_2, weight in zip(df_edgelist["Artist_1"], 
                                              df_edgelist["Artist_2"], 
                                              df_edgelist["Weight"]):
            g.add_edge(artist_1, artist_2, weight=weight, width=weight, title=f"Weight: {weight}")
    else:
        for artist_1, artist_2, weight in zip(df_edgelist["Artist_1"], 
                                              df_edgelist["Artist_2"], 
                                              df_edgelist["Weight"]):
            g.add_edge(artist_1, artist_2, weight=weight, title=f"Weight: {weight}")

    # Set algorithm
    if algo == "barnes":
        g.barnes_hut()
    elif algo == "forced":
        g.force_atlas_2based()
    elif algo == "hr":
        g.hrepulsion()
    return g.show(file_name)
In [266]:
create_pyvis_graph_partition(df_nodes, df_edgelist, 
                   height="800px", width="100%", bgcolor="#FDFEFE", font_color="#1E1C1E", algo="barnes",
                   notebook=True, buttons_YN=False, edge_width_YN=True, file_name="partition_graph.html")
Out[266]:
In [267]:
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
c_partition = CircosPlot(G, 
               node_grouping="Partition",
               node_order="Partition",
               node_color="Partition", 
               edge_width="weight",
               node_label_layout="rotation",
               group_label_position="middle",
               group_label_offset=2,
               figsize=(15,15))
# Draw c to the screen
c_partition.draw()
# Display the plot
plt.show()

Calculations

First, the groups generated by louvain algorithm are investigated and tried to make sense of by applying some EDA to it.

In [268]:
df_nodes["Partition"].value_counts()
Out[268]:
4.0     163
0.0      99
6.0      82
3.0      80
7.0      66
5.0      55
2.0      29
1.0      23
8.0      20
11.0      3
9.0       3
12.0      2
10.0      2
Name: Partition, dtype: int64

Noticeable is that some groups are very small and cannot really be interpreted due to that.

In [269]:
def partition_composition(nodes, criterion, n=1):
    """
    Gives out a dataframe which groups the different groups from the partition and gives out the top n shares
    by the criterion of each group out.
    """
    return pd.DataFrame(pd.DataFrame(nodes.groupby(["Partition"])[criterion].\
        value_counts(normalize=True, sort=True,ascending=False)).\
        groupby(level="Partition")[criterion].nlargest(n).reset_index(level=0, drop=True))
In [270]:
partition_composition(df_nodes, "Genre")
Out[270]:
Genre
Partition Genre
0.0 pop 0.474747
1.0 pop 0.608696
2.0 pop 0.379310
3.0 latin 0.737500
4.0 pop 0.680982
5.0 pop rap 0.490909
6.0 pop 0.500000
7.0 country 0.424242
8.0 german hip hop 0.800000
9.0 pop 1.000000
10.0 pop rap 1.000000
11.0 pop 0.666667
12.0 pop 1.000000

Interpeting the different groups generated by the lauvoin algorithm in the perspective of genres shows that it did not detect too many different genres and is mainly dominated by pop.

In [271]:
partition_composition(df_nodes, "Tracks Bins")
Out[271]:
Tracks Bins
Partition Tracks Bins
0.0 4: Up to 200 0.313131
1.0 4: Up to 200 0.521739
2.0 4: Up to 200 0.344828
3.0 2: Up to 50 0.262500
4.0 3: Up to 100 0.294479
5.0 4: Up to 200 0.363636
6.0 4: Up to 200 0.317073
7.0 4: Up to 200 0.333333
8.0 4: Up to 200 0.250000
9.0 2: Up to 50 0.666667
10.0 2: Up to 50 0.500000
11.0 1: Up to 25 1.000000
12.0 3: Up to 100 0.500000
In [272]:
partition_composition(df_nodes, "Popularity Bins")
Out[272]:
Popularity Bins
Partition Popularity Bins
0.0 2: Popular 0.737374
1.0 2: Popular 0.826087
2.0 2: Popular 0.655172
3.0 2: Popular 0.700000
4.0 2: Popular 0.699387
5.0 2: Popular 0.490909
6.0 2: Popular 0.829268
7.0 2: Popular 0.833333
8.0 2: Popular 0.850000
9.0 2: Popular 1.000000
10.0 2: Popular 1.000000
11.0 2: Popular 1.000000
12.0 3: Less Popular 1.000000

Not a single group was created that holds the very popular artists. This seems reasonable since it seems likely that they very connected and makes it more difficult for the algorithm to delimit. This hypothesis has be investigated further later on.


In [273]:
def artist_weight_grouping_top_n(nodes, criterion, n = 5):
    """
    Gives out a dataframe with the top n weight of artists grouped by the criterion given in.
    """
    return pd.DataFrame(pd.merge(nodes.loc[:,["Artist Name", criterion]], 
         pd.DataFrame(edges_count.groupby('Artists')["Weight"].sum().sort_values(ascending=False)),
         left_on="Artist Name", right_index=True, how="left").\
         groupby([criterion, "Artist Name"]).max().\
         groupby(level=criterion)["Weight"].nlargest(n).reset_index(level=0, drop=True))
In [274]:
def artist_cent_grouping_top_n(nodes, criterion, cent_type, n=5):
    """
    Gives out a dataframe with the top n artists based on a centrality type cent_type and grouping criterion
    given into the functionh.
    """
    return pd.DataFrame(pd.DataFrame(nodes.groupby([criterion, "Artist Name"])[cent_type].mean()).\
        groupby(level=criterion)[cent_type].nlargest(n).reset_index(level=0, drop=True))
In [275]:
artist_weight_grouping_top_n(df_nodes, "Partition", 3)
Out[275]:
Weight
Partition Artist Name
0.0 Rick Ross 427.0
Lil Wayne 398.0
DJ Khaled 305.0
1.0 Eminem 117.0
Timbaland 80.0
Dr. Dre 77.0
2.0 A$AP Rocky 145.0
Kendrick Lamar 121.0
ScHoolboy Q 94.0
3.0 Farruko 121.0
Arcangel 117.0
J Balvin 114.0
4.0 Major Lazer 150.0
Steve Aoki 150.0
Diplo 112.0
5.0 Gucci Mane 479.0
Future 348.0
Young Thug 313.0
6.0 Pitbull 127.0
Sean Paul 62.0
Lil Jon 59.0
7.0 Willie Nelson 30.0
Eric Clapton 23.0
Dolly Parton 18.0
8.0 Bonez MC 151.0
RAF Camora 129.0
KC Rebell 89.0
9.0 gnash 2.0
EDEN 1.0
Olivia O'Brien 1.0
10.0 Yung Gravy 21.0
bbno$ 17.0
11.0 Loud Luxury 10.0
Brando 8.0
Bryce Vine 3.0
12.0 Håkan Hellström 3.0
Veronica Maggio 3.0
In [276]:
artist_cent_grouping_top_n(df_nodes, "Partition", "Degree Centrality", 3)
Out[276]:
Degree Centrality
Partition Artist Name
0.0 Lil Wayne 0.145367
Ty Dolla $ign 0.138978
Nicki Minaj 0.137380
1.0 Eminem 0.062300
Timbaland 0.057508
Missy Elliott 0.054313
2.0 A$AP Rocky 0.084665
Kendrick Lamar 0.078275
ScHoolboy Q 0.067093
3.0 J Balvin 0.078275
Wisin 0.068690
Yandel 0.068690
4.0 Steve Aoki 0.075080
David Guetta 0.067093
Diplo 0.063898
5.0 Future 0.140575
Gucci Mane 0.135783
Quavo 0.123003
6.0 Pitbull 0.079872
Sean Paul 0.071885
Flo Rida 0.046326
7.0 Willie Nelson 0.020767
Kenny Chesney 0.019169
Miranda Lambert 0.019169
8.0 Ufo361 0.022364
Capital Bra 0.020767
GoldLink 0.012780
9.0 gnash 0.003195
EDEN 0.001597
Olivia O'Brien 0.001597
10.0 Yung Gravy 0.007987
bbno$ 0.003195
11.0 Bryce Vine 0.003195
Loud Luxury 0.003195
Brando 0.001597
12.0 Håkan Hellström 0.001597
Veronica Maggio 0.001597
In [277]:
artist_cent_grouping_top_n(df_nodes, "Partition", "Eigenvector Centrality", 3)
Out[277]:
Eigenvector Centrality
Partition Artist Name
0.0 Rick Ross 3.203724e-01
Lil Wayne 2.791344e-01
DJ Khaled 2.414005e-01
1.0 Eminem 2.677969e-02
Dr. Dre 2.110824e-02
50 Cent 1.706246e-02
2.0 Kendrick Lamar 5.087617e-02
A$AP Rocky 4.010527e-02
ScHoolboy Q 3.484246e-02
3.0 Yellow Claw 1.670624e-02
DJ Snake 1.577184e-02
J Balvin 1.201122e-02
4.0 Major Lazer 3.893297e-02
Ne-Yo 2.538158e-02
Calvin Harris 1.727967e-02
5.0 Gucci Mane 3.352150e-01
Future 2.474266e-01
Young Thug 2.321600e-01
6.0 Lil Jon 1.793786e-02
Flo Rida 1.791843e-02
Pitbull 1.733024e-02
7.0 Michael Jackson 2.566493e-03
Thomas Rhett 1.273664e-03
Willie Nelson 8.309593e-04
8.0 Ufo361 3.710205e-03
Kollegah 8.227255e-04
GoldLink 6.890698e-04
9.0 Olivia O'Brien 1.556862e-18
EDEN 1.362681e-18
gnash -1.804212e-18
10.0 Yung Gravy 1.653809e-03
bbno$ 1.381231e-04
11.0 Bryce Vine 4.517102e-04
Loud Luxury 4.668263e-06
Brando 1.926506e-07
12.0 Veronica Maggio 2.106835e-19
Håkan Hellström -4.429116e-19

In [278]:
def dens_and_trans_calc(df_artists, df_edgelist, group_criterion):
    table = PrettyTable(["Group","Density", "Transitivity"])
    for criterion in df_artists[group_criterion].unique():
        # since there are some artists without a Partition value assigned, they are not shown in the output table
        # with the str and nan the possible cases are covered
        if isinstance(criterion,str):
            G_temp = generate_networkx(df_artists[df_artists[group_criterion] == criterion], 
                           df_edgelist,
                           [group_criterion])
            table.add_row([criterion, round(nx.density(G_temp),4), round(nx.transitivity(G_temp),4)])
        elif math.isnan(criterion) == False:
            G_temp = generate_networkx(df_artists[df_artists[group_criterion] == criterion], 
                                       df_edgelist,
                                       [group_criterion])
            table.add_row([criterion, round(nx.density(G_temp),4), round(nx.transitivity(G_temp),4)])
    print(table)
In [279]:
dens_and_trans_calc(df_nodes, df_edgelist, "Partition")
+-------+---------+--------------+
| Group | Density | Transitivity |
+-------+---------+--------------+
|  0.0  |  0.1612 |    0.4899    |
|  1.0  |   0.17  |    0.3578    |
|  2.0  |  0.1527 |    0.354     |
|  3.0  |  0.1595 |    0.4455    |
|  4.0  |  0.0275 |    0.1084    |
|  5.0  |  0.2498 |    0.4672    |
|  6.0  |  0.0515 |    0.2768    |
|  7.0  |  0.0457 |    0.253     |
|  8.0  |  0.2632 |     0.52     |
|  9.0  |  0.6667 |      0       |
|  10.0 |   1.0   |      0       |
|  11.0 |  0.6667 |      0       |
|  12.0 |   1.0   |      0       |
+-------+---------+--------------+

Community 7 observed by itself is the strongest connected network. It has over a quarter of all possible connections and over half of all possible triangle connections.


In [280]:
print(nx.attribute_assortativity_coefficient(G, "Partition"))
0.45849023328564054

The assortivity measure shows the similiartiy of connections in regard to the given attribute. With 0.47 this value is quite high, compared to the ones calculated later on, which is logical since this was the goal of the algorithm to detect similarity within communities. However, these generated communities are difficult for humans to interpret and to draw conclusions from.

Genre analysis

Network Analysis

In [281]:
def create_pyvis_graph_genre(df_artists, df_edgelist, height="1500px", width="100%", bgcolor="#222222", 
                        font_color="#1E1C1E", algo="barnes", notebook=True, 
                        buttons_YN=True, edge_width_YN=True, heading="", file_name="pyvis_graph.html"):
    """
    Creates a pyvis visualization of a network colored by genre, edge width by weight and node size by 
    degree centrality.
    
    Input:
    - df_artists: dataframe holding the relevant artist data
    - df_edgelist: dataframe holding the relevant edges  
    - height (str): defines height of the graph
    - bgcolor (str): defines of the background (HTML color code can be used or simple color naming)
    - fontcolor (str): defines the font color (HTML color code can be used or simple color naming)
    - algo (str): the options "barnes", "forced" and "hr" are possible options for the algorithm to set up graph
    - notebook (boolean): determines if graph is saved as seperate HTML file or shown within the notebook
    - buttons_YN (boolean): adds a bar with design setting options on the side
    - edge_with_YN (boolean): adjusts the width of the edges based on the weight if turned on
    - heading (str): adds a header to the graph
    
    Output:
    pyvis graph
    """
    # Set up network
    g = Network(height=height, width=width, 
                bgcolor=bgcolor, font_color=font_color,
                notebook=notebook, heading=heading)

    # Set width of graph depending if setting should be shown on the side, only if not displayed in notebook
    # since there results in issues
    if notebook==True:
        if buttons_YN == True:
            g.show_buttons()
        else:
            g.width = "100%"
    else:
        if buttons_YN == True:
            g.width = "70%"
            g.show_buttons(["physics"])
        else:
            g.width = "100%"
    # Get unique list of artists to create nodes
    for artist, value, genre, color in zip(df_artists["Artist Name"], 
                                           df_artists["Degree Centrality"], 
                                           df_artists["Genre"], 
                                           df_artists["Genre Color"]):
        if artist in set(pd.concat([df_edgelist["Artist_1"], df_edgelist["Artist_2"]])):
            g.add_node(artist, 
                       value=value, 
                       title= f"Artist: {artist} | Genre: {genre}", 
                       color=color,
                       partition=partition)
    # Create edges with all artist pairs
    if edge_width_YN == True:
        for artist_1, artist_2, weight in zip(df_edgelist["Artist_1"], 
                                              df_edgelist["Artist_2"], 
                                              df_edgelist["Weight"]):
            g.add_edge(artist_1, artist_2, weight=weight, width=weight, title=f"Weight: {weight}")
    else:
        for artist_1, artist_2, weight in zip(df_edgelist["Artist_1"], 
                                              df_edgelist["Artist_2"], 
                                              df_edgelist["Weight"]):
            g.add_edge(artist_1, artist_2, weight=weight, title=f"Weight: {weight}")

    # Set algorithm
    if algo == "barnes":
        g.barnes_hut()
    elif algo == "forced":
        g.force_atlas_2based()
    elif algo == "hr":
        g.hrepulsion()
    return g.show(file_name)

In [282]:
create_pyvis_graph_genre(df_nodes, df_edgelist, height="800px", width="100%", bgcolor="#FDFEFE", font_color="#1E1C1E", algo="barnes",
                       notebook=True, buttons_YN=False, edge_width_YN=True, file_name="genre_graph.html")
Out[282]:

Latin and german hip hop is highly connected with itself, so they prefer collaborationg with people within their genre most. Pop looks like the biggest genre in within the network, with quite some important artist as main collaborators, also for artists of other genres. However, the most connected network seems to be pop rap with many connections to artists of other genres, but also very stronlgy connected in itself. Also their high rate of people with artists with high degree centrality scores is noticable, such as Wiz Khalifa, Lil Wayne, Big Sean and Gucci Mane.


Filtering further down by amount of connections to make it less messy $\rightarrow$ five or more collaborations

In [283]:
len(df_edgelist[df_edgelist["Weight"]>4])
Out[283]:
436
In [284]:
create_pyvis_graph_genre(df_nodes, df_edgelist[df_edgelist["Weight"]>4], 
                   height="800px", width="100%", bgcolor="#FDFEFE", font_color="#1E1C1E", algo="barnes",
                   notebook=True, buttons_YN=False, edge_width_YN=True, file_name="genre_graph_five_plus.html")
Out[284]:

Pop rap, latin and german hip hop still pop out as strongly connected in this network. For german hip hop the heavy collaborations between RAF Camora and Bonez Mc + KC Rebell and Summer Cem are notiable. In the latin genre the interconnectedness of the four artists Sech, Dalex, Justin Quiles and Lenny Tavarez is special.


Filtering further down by amount of connections to make it less messy $\rightarrow$ ten or more collaborations

In [285]:
len(df_edgelist[df_edgelist["Weight"]>9])
Out[285]:
138
In [286]:
create_pyvis_graph_genre(df_nodes, df_edgelist[df_edgelist["Weight"]>9], 
                   height="800px", width="100%", bgcolor="#FDFEFE", font_color="#1E1C1E", algo="barnes",
                   notebook=True, buttons_YN=False, edge_width_YN=True, file_name="genre_graph_ten_plus.html")
Out[286]:

The heaviest collaborators in the overall network can be noticed well in this last pyvis visualization. Gucci Mane, Rick Ross and Lil Wayne are highly connected artists. DJ Khaled, who falls under the pop genre, holds strong connections to man of the top pop rap artists.

In [287]:
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
c_genre = CircosPlot(G, 
               node_grouping="Genre",
               node_order="Genre",
               node_color="Genre", 
               edge_width="weight",
               node_label_layout="rotation",
               group_label_position="middle",
               group_label_offset=2,
               figsize=(15,15))
# Draw c to the screen
c_genre.draw()
# Display the plot
plt.show()

The circos plot shows as the interactive pyvis visualization before the strong collaboration of artists of pop rap and rap, but also between these these two. However, the circos plot is very handy to also analyse the groups with very little collaborations. Contemporary country, alternative metal, rock and modern rock stand out. Since in the genre rock often the artists are not individuals, but bands, this comes as no suprise. It makes collaborations with even more people very difficult. For contemporary country a possible explanation is that it is a very specific genre that does not work to well with other mainstream genres. Pop on the other works quite well for collaborations, especially with pop rap. Here however, only some of the huge group of artists grew into popular cross-genre collaboratators. As the next graph shows among them are Christ Brown, Nicki Minaj DJ Khaled or T-Pain.


In the following the most influencial and collaborative artists can be observed individually. The same names as mentioned before clearly stand out.

In [288]:
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
c_genre = CircosPlot(G_five_plus, 
               node_grouping="Genre",
               node_color="Genre", 
               node_labels=True, 
               node_label_layout="rotation",
               edge_width="weight",
               group_label_position="middle",
               group_label_offset=14,
               figsize=(15,15))
c_genre.draw_group_labels()
# Draw c to the screen
c_genre.draw()

# Display the plot
plt.show()
In [289]:
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
c_genre = CircosPlot(G_ten_plus, 
               node_grouping="Genre",
               node_color="Genre", 
               node_labels=True, 
               node_label_layout="rotation",
               edge_width="weight",
               group_label_position="middle",
               group_label_offset=10,
               figsize=(15,15))
c_genre.draw_group_labels()
# Draw c to the screen
c_genre.draw()

# Display the plot
plt.show()

Calculations

The table shows the highest connected artists based on the weight for each genre.

In [290]:
artist_weight_grouping_top_n(df_nodes, "Genre", 3)
Out[290]:
Weight
Genre Artist Name
alternative metal Linkin Park 8.0
Marilyn Manson 6.0
Limp Bizkit 3.0
country Willie Nelson 30.0
Dolly Parton 18.0
Jason Aldean 18.0
german hip hop Bonez MC 151.0
RAF Camora 129.0
KC Rebell 89.0
latin Farruko 121.0
Arcangel 117.0
J Balvin 114.0
pop DJ Khaled 305.0
Chris Brown 255.0
Nicki Minaj 250.0
pop dance T.I. 227.0
Tyga 191.0
Steve Aoki 150.0
pop rap Gucci Mane 479.0
Rick Ross 427.0
Lil Wayne 398.0
rap Snoop Dogg 234.0
Travis Scott 185.0
Kanye West 179.0
rock Eric Clapton 23.0
Fall Out Boy 16.0
Santana 16.0

The table shows the highest connected artists based on the degree centrality for each genre.

In [291]:
artist_cent_grouping_top_n(df_nodes, "Genre", "Degree Centrality", 3)
Out[291]:
Degree Centrality
Genre Artist Name
alternative metal Marilyn Manson 0.007987
Linkin Park 0.006390
Five Finger Death Punch 0.003195
country Willie Nelson 0.020767
Kenny Chesney 0.019169
Miranda Lambert 0.019169
german hip hop Ufo361 0.022364
Capital Bra 0.020767
Kontra K 0.012780
latin J Balvin 0.078275
Wisin 0.068690
Yandel 0.068690
pop Ty Dolla $ign 0.138978
Nicki Minaj 0.137380
Chris Brown 0.123003
pop dance T.I. 0.107029
Tyga 0.097444
Ludacris 0.084665
pop rap Lil Wayne 0.145367
Future 0.140575
Gucci Mane 0.135783
rap Snoop Dogg 0.123003
Travis Scott 0.091054
Kendrick Lamar 0.078275
rock Fall Out Boy 0.017572
Santana 0.015974
Eric Clapton 0.011182

The table shows the highest connected artists based on the eigenvector centrality for each genre.

In [292]:
artist_cent_grouping_top_n(df_nodes, "Genre", "Eigenvector Centrality", 3)
Out[292]:
Eigenvector Centrality
Genre Artist Name
alternative metal Linkin Park 0.001242
Marilyn Manson 0.000297
Limp Bizkit 0.000160
country Thomas Rhett 0.001274
Willie Nelson 0.000831
Brantley Gilbert 0.000756
german hip hop Ufo361 0.003710
Kollegah 0.000823
RAF Camora 0.000256
latin J Balvin 0.012011
Bad Bunny 0.006450
Anuel AA 0.005351
pop DJ Khaled 0.241401
Nicki Minaj 0.173952
Chris Brown 0.158421
pop dance T.I. 0.144493
Tyga 0.112114
Ludacris 0.063987
pop rap Gucci Mane 0.335215
Rick Ross 0.320372
Lil Wayne 0.279134
rap Kanye West 0.108104
Travis Scott 0.090836
Snoop Dogg 0.078985
rock Fall Out Boy 0.003003
AJR 0.001714
Lenny Kravitz 0.001045

In [293]:
dens_and_trans_calc(df_nodes, df_edgelist, "Genre")
+-------------------+---------+--------------+
|       Group       | Density | Transitivity |
+-------------------+---------+--------------+
|      pop rap      |  0.1813 |    0.4945    |
|        pop        |  0.0237 |    0.152     |
|        rap        |  0.1354 |    0.3082    |
|       latin       |  0.1301 |    0.4595    |
|        rock       |  0.0186 |      0       |
| alternative metal |  0.0667 |      0       |
|     pop dance     |  0.0442 |    0.2233    |
|      country      |  0.1129 |    0.3529    |
|   german hip hop  |  0.3833 |    0.5612    |
+-------------------+---------+--------------+

In [294]:
print(nx.attribute_assortativity_coefficient(G, "Genre"))
0.33397887904916385

Artists have a much higher tendency to collaborate with artists within their genre than a random genre.

Other Analysis

Network Visualization

The following circos plot shows the artists ranked starting from the left going around anticlockwise.

In [295]:
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
c_rank = CircosPlot(G, 
               node_order="Rank",
               node_labels=True,
               node_label_layout="rotation",
               edge_width="weight",
               figsize=(20,20),
               fontsize=8)
# Draw c to the screen
c_rank.figure.tight_layout()
c_rank.draw()
# Display the plot
plt.show()

The graph shows that the artists ranked higher generally have stronger network and produce more tracks in collaboration than the lower rank artists.

In [296]:
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
c_popularity = CircosPlot(G, 
               node_grouping="Popularity Bins",
               node_color="Popularity Bins", 
               node_labels=False, 
               node_label_layout="rotation",
               edge_width="weight",
               group_label_position="middle",
               group_label_offset=10,
               figsize=(15,15))
# Draw c to the screen
c_popularity.figure.tight_layout()
c_popularity.draw_group_labels()
c_popularity.draw()
# Display the plot
plt.show()

Since the higher ranked artists are also the more popular ones the graphs show very similar results. If artists are more popular they tend to have more collaborations and seem to be also to be more with artists which move in the some area of popularity.

In [297]:
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
c_tracks = CircosPlot(G, 
               node_grouping="Tracks Bins",
               node_color="Tracks Bins",
               node_labels=False, 
               node_label_layout="rotation",
               edge_width="weight",
               group_label_position="middle",
               group_label_offset=10,
               figsize=(15,15))
# Draw c to the screen
c_tracks.figure.tight_layout()
c_tracks.draw()
# Display the plot
plt.show()

The last graph supports what common sense would suggest. People who produce overall more songs also produce in absolute number more songs in collaborations. More experienced artists seem to chose to collaborate with artists who also have already produced many songs.

Calculations

In [298]:
artist_weight_grouping_top_n(df_nodes, "Tracks Bins", 3)
Out[298]:
Weight
Tracks Bins Artist Name
1: Up to 25 Lenny Tavárez 58.0
Nio Garcia 31.0
Casper Magico 27.0
2: Up to 50 Dalex 63.0
A$AP Mob 53.0
Darell 53.0
3: Up to 100 Mike WiLL Made-It 87.0
Mustard 83.0
Sech 82.0
4: Up to 200 DJ Khaled 305.0
Quavo 256.0
Travis Scott 185.0
5: Up to 300 Nicki Minaj 250.0
Ty Dolla $ign 238.0
Big Sean 226.0
6: Up to 500 Rick Ross 427.0
Young Thug 313.0
2 Chainz 282.0
7: Over 500 Gucci Mane 479.0
Lil Wayne 398.0
Future 348.0
In [299]:
artist_weight_grouping_top_n(df_nodes, "Popularity Bins", 3)
Out[299]:
Weight
Popularity Bins Artist Name
1: Very Popular Lil Wayne 398.0
Future 348.0
Young Thug 313.0
2: Popular Gucci Mane 479.0
Rick Ross 427.0
2 Chainz 282.0
3: Less Popular A$AP Mob 53.0
Yellow Claw 50.0
Tinie Tempah 38.0
In [300]:
artist_cent_grouping_top_n(df_nodes, "Popularity Bins", "Degree Centrality", 3)
Out[300]:
Degree Centrality
Popularity Bins Artist Name
1: Very Popular Lil Wayne 0.145367
Future 0.140575
Ty Dolla $ign 0.138978
2: Popular Gucci Mane 0.135783
2 Chainz 0.119808
French Montana 0.111821
3: Less Popular Tinie Tempah 0.031949
Robin Thicke 0.030351
A$AP Mob 0.028754
In [301]:
print(nx.attribute_assortativity_coefficient(G, "Popularity Bins"))
0.07543296938654816

The number supports the visual observation that artists within the same range of popularity have a slightly higher tendency to collaborate.

In [302]:
print(nx.attribute_assortativity_coefficient(G, "Tracks Bins"))
0.03563773098467429

The number supports the visual observation that artists within the same range of overall produced songs have a slightly higher tendency to collaborate.


Bringing all this together: We want a function which gives us the hottes artists of a genre based on the preferred collabroation measure. By having them figured out in the next step we try to figure out how we can reach these artists.

In [303]:
def top_n_artists_in_genre(df, genre, collab_measure, n):
    """
    The function gives out the top n artists by the desired genre and overall collaboration measure.
    
    Input:
    - df (dataframe): artist dataframe
    - genre (str): the genre to filter on
    - collab_measure (str): the desired collaboration measure of an artist ("Eigenvector Centrality", 
    "Degree Centrality" or "Weight")
    
    Output:
    - df (dataframe): as determined on top
    """
    return pd.DataFrame(df[df["Genre"] == genre].loc[:, ["Artist Name", collab_measure]].\
                        sort_values(by=[collab_measure], ascending=False))[:n]
In [411]:
top_n_artists_in_genre(df_nodes, "german hip hop", "Degree Centrality", 5)
Out[411]:
Artist Name Degree Centrality
408 Ufo361 0.022364
215 Capital Bra 0.020767
261 RAF Camora 0.012780
482 Kontra K 0.012780
511 Sido 0.011182

Scenario 1: If we know which artist we want to get our hands on we possible already know someone in the music industry. In this case we want to see over which chain we can reach out to our desired artist for collaboration. The assumption is that there is a higher chance to know someone when they are less famous. Second, if the connection between the artists is stronger it promotes the probability to be helpful.

In [305]:
def get_in_contact_with_connection(G, df_artist, df_edgelist, source, target, n):
    """
    Gives out a possible chains of artists how to reach artist for possible collaboration with 
    one artist know as starting point. The sum of the ranks of all artists is calculated 
    and returned in an descending order.
    
    Input:
    - G (network): network to analyze
    - df_artist (dataframe): dataframe that holds the information of the artists
    - source (str): artist that is known
    - target (str): artist that want to be reached out to
    - n (int): amount of chains shown
    
    Output:
    - df (dataframe): holds n chains of connections to reach the desired artist.
    """
    # calculate all shortest paths from source to target
    df = pd.DataFrame(nx.all_shortest_paths(G, source, target))
    # lists to place sums of rank and weight of each chain in
    sum_list_rank = []
    sum_list_weight = []
    for index, row in df.iterrows():
        # lists to place temporary values of rank and weight for each chain in, to calculate the sum at the end
        temp_list_rank = []
        temp_list_weight = []
        for artist_iter in range(len(row)-1):
            # create df subset with all rows where first artist is in target or source
            subset = df_edgelist[(df_edgelist["Artist_1"] == row[artist_iter]) |\
                                 (df_edgelist["Artist_2"] == row[artist_iter])]
            # get the row of the pair by filtering down to one row from the subset by searching for second artist
            temp_list_weight.append(subset[(subset["Artist_1"] == row[artist_iter+1]) |\
                   (subset["Artist_2"] == row[artist_iter+1])].iloc[0]["Weight"])
            # take weight and add to list
        sum_list_weight.append(sum(temp_list_weight))
        # same idea of calculating the sum, but simpler loop since only one artist name required
        for artist in row:
            temp_list_rank.append(df_artists[df_artists["Artist Name"] == artist].iloc[0]["Rank"])
        sum_list_rank.append(sum(temp_list_rank))
    # adding the weight and rank to the df
    df["Sorting Criterion Weight Sum"] = sum_list_weight
    df["Sorting Criterion Rank Sum"] = sum_list_rank
    # sorting the df by rank 
    df.sort_values(by=["Sorting Criterion Rank Sum"], ascending=False, inplace=True)
    df.reset_index(inplace=True, drop=True)
    # return the n first rows of the df
    return df[:n]
In [410]:
get_in_contact_with_connection(G, df_nodes, df_edgelist, "Kollegah", "Ufo361", 5)
Out[410]:
0 1 2 Sorting Criterion Weight Sum Sorting Criterion Rank Sum
0 Kollegah KC Rebell Ufo361 7 1880
1 Kollegah RAF Camora Ufo361 8 1254

Scenario 2: We know which artist we want to get our hands on and don't know anyone music industry. In this case we determine how many degrees away we are willing to start to get in contact with the artist and then to see over which chains we can reach out to our desired artist for collaboration. The assumption is that there is a higher chance to know someone when they are less famous. Second, if the connection between the artists is stronger it promotes the probability to be helpful.

In [307]:
def get_in_contact_without_connection(G, df_artists, df_edgelist, target, degree, n):
    """
    Gives out a possible chains of artists how to reach artist for possible collaboration with no artist know
    as starting point. A degree is given in to determine how far away we are willing to start.
    The sum of the ranks of all artists is calculated and returned in an descending order.
    
    Input:
    - G (network): network to analyze
    - df_artist (dataframe): dataframe that holds the information of the artists
    - target (str): artist that want to be reached out to
    - degree (int): how many degrees away are we willing to start reaching out
    - n (int): amount of chains shown
    
    Output:
    - df (dataframe): holds n chains of connections to reach the desired artist.
    """
    degree_artists = [key for (key, value) in nx.single_source_shortest_path_length(G, target).items()\
                      if value == degree]
    # create empty df to fill with all possible connections
    df = pd.DataFrame(nx.all_shortest_paths(G, degree_artists[0], target))
    df = df[:0]
    for i in degree_artists:
        df_temp = pd.DataFrame(nx.all_shortest_paths(G, i, target))
        df = df.append(df_temp)
    # lists to place sums of rank and weight of each chain in
    sum_list_rank = []
    sum_list_weight = []
    for index, row in df.iterrows():
        # lists to place temporary values of rank and weight for each chain in, to calculate the sum at the end
        temp_list_rank = []
        temp_list_weight = []
        for artist_iter in range(len(row)-1):
            # create df subset with all rows where first artist is in target or source
            subset = df_edgelist[(df_edgelist["Artist_1"] == row[artist_iter]) |\
                                 (df_edgelist["Artist_2"] == row[artist_iter])]
            # get the row of the pair by filtering down to one row from the subset by searching for second artist
            temp_list_weight.append(subset[(subset["Artist_1"] == row[artist_iter+1]) |\
                   (subset["Artist_2"] == row[artist_iter+1])].iloc[0]["Weight"])
            # take weight and add to list
        sum_list_weight.append(sum(temp_list_weight))
        # same idea of calculating the sum, but simpler loop since only one artist name required
        for artist in row:
            temp_list_rank.append(df_artists[df_artists["Artist Name"] == artist].iloc[0]["Rank"])
        sum_list_rank.append(sum(temp_list_rank))
    # adding the weight and rank to the df
    df["Sorting Criterion Weight Sum"] = sum_list_weight
    df["Sorting Criterion Rank Sum"] = sum_list_rank
    # sorting the df by rank 
    df.sort_values(by=["Sorting Criterion Rank Sum"], ascending=False, inplace=True)
    df.reset_index(inplace=True, drop=True)
    # return the n first rows of the df
    return df[:n]
In [308]:
get_in_contact_without_connection(G, df_nodes, df_edgelist, "Grateful Dead", 6, 5)
Out[308]:
0 1 2 3 4 5 6 Sorting Criterion Weight Sum Sorting Criterion Rank Sum
0 TLC Missy Elliott Timbaland Brad Paisley The Rolling Stones Bob Dylan Grateful Dead 19 4620
1 Juicy J Jeezy Timbaland Brad Paisley The Rolling Stones Bob Dylan Grateful Dead 14 4555
2 Lil Jon Fat Joe Mary J. Blige Eric Clapton The Rolling Stones Bob Dylan Grateful Dead 15 4477
3 Kelsea Ballerini Afrojack Timbaland Brad Paisley The Rolling Stones Bob Dylan Grateful Dead 13 4470
4 Juicy J Fat Joe Mary J. Blige Eric Clapton The Rolling Stones Bob Dylan Grateful Dead 14 4451

Natural Language Processing

In [309]:
# Load the data
df_lyrics = pd.read_csv(PATH_LYRICS_CLEAN, index_col=0, converters={"Lyrics Clean Tok": ast.literal_eval})
df_lyrics.head()
Out[309]:
Artist Title Lyrics Raw Genre Lyrics Lyrics Clean Tok Lyrics Clean No Tok Word Count
0 Drake God’s Plan [Intro]\nAnd they wishin' and wishin' and wish... pop rap . And they wishin' and wishin' and wishin' and... [wishin, wishin, wishin, wishin, movin, calm, ... wishin wishin wishin wishin movin calm not_sta... 360
1 Drake In My Feelings [Intro: Drake]\nTrap, TrapMoneyBenny\nThis shi... pop rap . Trap, TrapMoneyBenny. This shit got me in my... [trap, trapmoneybenny, shit, feeling, real, ki... trap trapmoneybenny shit feeling real kiki lov... 574
2 Drake Hotline Bling [Intro]\nYou used to call me on my\nYou used t... pop rap . You used to call me on my. You used to, you ... [used, call, used, used, used, call, cell, pho... used call used used used call cell phone need ... 449
3 Drake One Dance [Intro: Kyla]\nBaby, I like your style\n\n[Ver... pop rap . Baby, I like your style. . . Grips on your w... [baby, like, style, grip, waist, front, back, ... baby like style grip waist front back know not... 416
4 Drake Hold On, We’re Going Home [Produced by Nineteen85, Majid Jordan & Noah "... pop rap . . . I got my eyes on you. You're everything ... [eye, everything, want, love, emotion, endless... eye everything want love emotion endlessly not... 348

Let' start understanding the song lyrics by looking at the most frequently ocuring words.

In [310]:
def plot_top_n_words(df_column,n=15):
    """
    Creates a barplot of top n words.
    
        Parameters:
            df_column (pandas series): Series consisting of tokenized texts
            n (int): Number of words that should be displayed
    """
    
    # Get number of occurences of each word
    word_count = Counter([word for words in df_column for word in words])
    
    # Get most common words and store words and their counts in lists
    words,counts = map(list,zip(*word_count.most_common(n)))
    
    plt.figure(figsize=(20,10))
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.barh(y=words, width=counts)
    plt.gca().invert_yaxis()
    plt.title(f"Top {n} most common words",fontsize=20)
    plt.show()

plot_top_n_words(df_lyrics["Lyrics Clean Tok"],20)

Do different genres use different words? Let's have a look.

Pop
In [311]:
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "pop"]["Lyrics Clean Tok"],15)
Rock
In [312]:
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "rock"]["Lyrics Clean Tok"],15)
Pop Rap
In [313]:
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "pop rap"]["Lyrics Clean Tok"],15)
Rap
In [314]:
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "rap"]["Lyrics Clean Tok"],15)
Pop Dance
In [315]:
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "pop dance"]["Lyrics Clean Tok"],15)
Country
In [316]:
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "country"]["Lyrics Clean Tok"],15)
Alternative Metal
In [317]:
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "alternative metal"]["Lyrics Clean Tok"],15)
Latin
In [318]:
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "latin"]["Lyrics Clean Tok"],15)

While some observations can already be made (e.g. words like nigga, bitch, fuck, shit predominantly appear in the rap category), the words distributions across the other genres look pretty much alike. We will return to this later to find a better way of determining the genres based on the wording

Apart from the content, also the word count of the songs can be analysed. To do so, let's first create a new column that stores the number of words of each song. Let's first check the distribution of song length.

In [319]:
# Count the number of tokens for each lyric
df_lyrics["Word Count"] = df_lyrics["Lyrics Clean Tok"].apply(lambda x: len(x))

# Count the number of distinct tokens for each lyric
df_lyrics["Distinct Word Count"] = df_lyrics["Lyrics Clean Tok"].apply(lambda x: len(set(x)))

# Get ratio of distinct words over all words in lyric
df_lyrics["Word Variability"] = df_lyrics["Distinct Word Count"] / df_lyrics["Word Count"]
In [320]:
df_lyrics.head()
Out[320]:
Artist Title Lyrics Raw Genre Lyrics Lyrics Clean Tok Lyrics Clean No Tok Word Count Distinct Word Count Word Variability
0 Drake God’s Plan [Intro]\nAnd they wishin' and wishin' and wish... pop rap . And they wishin' and wishin' and wishin' and... [wishin, wishin, wishin, wishin, movin, calm, ... wishin wishin wishin wishin movin calm not_sta... 108 60 0.555556
1 Drake In My Feelings [Intro: Drake]\nTrap, TrapMoneyBenny\nThis shi... pop rap . Trap, TrapMoneyBenny. This shit got me in my... [trap, trapmoneybenny, shit, feeling, real, ki... trap trapmoneybenny shit feeling real kiki lov... 171 80 0.467836
2 Drake Hotline Bling [Intro]\nYou used to call me on my\nYou used t... pop rap . You used to call me on my. You used to, you ... [used, call, used, used, used, call, cell, pho... used call used used used call cell phone need ... 181 66 0.364641
3 Drake One Dance [Intro: Kyla]\nBaby, I like your style\n\n[Ver... pop rap . Baby, I like your style. . . Grips on your w... [baby, like, style, grip, waist, front, back, ... baby like style grip waist front back know not... 156 56 0.358974
4 Drake Hold On, We’re Going Home [Produced by Nineteen85, Majid Jordan & Noah "... pop rap . . . I got my eyes on you. You're everything ... [eye, everything, want, love, emotion, endless... eye everything want love emotion endlessly not... 142 32 0.225352
In [321]:
# Plot track length
sns.displot(df_lyrics["Word Count"])
Out[321]:
<seaborn.axisgrid.FacetGrid at 0x1a9d1ec050>

The song length seems to be quite normally distributed. However, it's difficult to see as there seems to be a strong outlier at around 12,000 words. Let's identify this song.

In [322]:
df_lyrics[df_lyrics["Word Count"] == max(df_lyrics["Word Count"])]
Out[322]:
Artist Title Lyrics Raw Genre Lyrics Lyrics Clean Tok Lyrics Clean No Tok Word Count Distinct Word Count Word Variability
588 Juice WRLD Tim Westwood Freestyle [Intro]\nYeah, uh, uh\nUh-huh, huh, hahaha\nSo... rap . Yeah, uh, uh. Uh-huh, huh, hahaha. Some of y... [hahaha, sayin, like, spittin, writtens, shit,... hahaha sayin like spittin writtens shit haha n... 5039 1707 0.338758

When this outlier is removed, the data is still right-skewed as there are a few more songs outside the expected distribution.

In [323]:
sns.displot(df_lyrics[df_lyrics["Word Count"] < max(df_lyrics["Word Count"])]["Word Count"])
Out[323]:
<seaborn.axisgrid.FacetGrid at 0x1a9b65f090>

How do genres differ with regards to their song length?

In [324]:
# Group genre by their mean, median, and max of word count and their total number of occurences
grouping_wc = df_lyrics.groupby("Genre", sort=True).agg({'Word Count': ['mean', 'median', 'max',"count"]}).reset_index()
grouping_wc
Out[324]:
Genre Word Count
mean median max count
0 alternative metal 127.348115 119 477 451
1 country 116.492770 117 241 899
2 latin 154.869159 139 366 107
3 pop 137.710619 126 871 7129
4 pop dance 136.616404 117 523 1207
5 pop rap 241.050847 226 1426 1888
6 rap 245.271739 228 5039 1196
7 rock 100.855050 94 630 3277

Rappers clearly produce the longest songs. Even though the mean might be skewed by the extremely long song mentioned above, the median confirms that rap songs tend to be longer.

In [325]:
# Group genre by their mean, median, and max of distinct word count and their total number of occurences
grouping_dwc = df_lyrics.groupby("Genre", sort=True).agg({'Distinct Word Count': ['mean', 'median', 'max',"count"]}).reset_index()
grouping_dwc
Out[325]:
Genre Distinct Word Count
mean median max count
0 alternative metal 59.396896 52 229 451
1 country 59.944383 58 144 899
2 latin 80.168224 64 277 107
3 pop 61.386029 54 422 7129
4 pop dance 63.163215 50 313 1207
5 pop rap 128.311441 120 738 1888
6 rap 134.651338 123 1707 1196
7 rock 52.255722 48 436 3277
In [326]:
# Group genre by their mean, median, and max of word variability and their total number of occurences
grouping_wv = df_lyrics.groupby("Genre", sort=True).agg({'Word Variability': ['mean', 'median', 'max',"count"]}).reset_index()
grouping_wv
Out[326]:
Genre Word Variability
mean median max count
0 alternative metal 0.473203 0.465753 0.981132 451
1 country 0.527873 0.513699 0.932203 899
2 latin 0.511346 0.467153 0.897959 107
3 pop 0.456950 0.448980 1.000000 7129
4 pop dance 0.458044 0.446809 1.000000 1207
5 pop rap 0.531353 0.525500 0.920000 1888
6 rap 0.549537 0.547054 0.894009 1196
7 rock 0.536842 0.524590 1.000000 3277
In [327]:
# Look at lyrics that have variability less than 0.05 (i.e. that are extremely repetitive)
df_lyrics[df_lyrics["Word Variability"] < 0.05].head()
Out[327]:
Artist Title Lyrics Raw Genre Lyrics Lyrics Clean Tok Lyrics Clean No Tok Word Count Distinct Word Count Word Variability
1047 Marshmello Blocks [Instrumental]\n\n[Chorus]\nBass is kicking, d... pop . . . Bass is kicking, drums is drumming. Beat... [bass, kicking, drum, drumming, beatin, block,... bass kicking drum drumming beatin block hear c... 186 9 0.048387
1049 Marshmello Love U [Chorus]\nBaby, I love you\nBaby, I love you t... pop . Baby, I love you. Baby, I love you too. Baby... [baby, love, baby, love, baby, love, baby, lov... baby love baby love baby love baby love last f... 193 5 0.025907
1052 Marshmello Take It Back [Instrumental]\n\n[Drop]\n(Take it back, take ... pop . . . (Take it back, take it back, take it bac... [take, back, take, back, take, back, take, bac... take back take back take back take back take b... 120 2 0.016667
1054 Marshmello Know Me [Verse]\nEverybody know me\nEverybody know me\... pop . Everybody know me. Everybody know me. Everyb... [everybody, know, everybody, know, everybody, ... everybody know everybody know everybody know e... 171 5 0.029240
1058 Marshmello Check This Out [Hook]\nCheck this, check this, check this out... pop . Check this, check this, check this out. Chec... [check, check, check, check, check, check, che... check check check check check check check chec... 160 1 0.006250

Genre Prediction

Let us now check whether it is possible to predict the genre of a song based on its lyrics. This is done using a logistic regression.

In [328]:
def plot_result_analysis(y_test, y_pred, class_labels):

    print(classification_report(y_test,y_pred,zero_division=False))

    conf_mat = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10,10))
    sns.heatmap(conf_mat, annot=True, fmt='d', cmap="YlGnBu", xticklabels=class_labels, yticklabels=class_labels)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
Logistic Regression
In [329]:
X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]

X_train,X_test,y_train,y_test = train_test_split(X, y, train_size = 0.75, stratify = y, random_state=RANDOM)
In [330]:
# Simple Pipe

pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('class', LogisticRegression(max_iter=1000,random_state=RANDOM)),
])

y_pred = pipeline.fit(X_train,y_train).predict(X_test)

plot_result_analysis(y_test,y_pred,pipeline.classes_)
                   precision    recall  f1-score   support

alternative metal       0.33      0.01      0.02       113
          country       0.82      0.28      0.41       225
            latin       0.00      0.00      0.00        27
              pop       0.57      0.86      0.69      1782
        pop dance       0.78      0.02      0.05       302
          pop rap       0.58      0.51      0.54       472
              rap       0.65      0.27      0.38       299
             rock       0.54      0.47      0.50       819

         accuracy                           0.57      4039
        macro avg       0.53      0.30      0.32      4039
     weighted avg       0.59      0.57      0.52      4039

In [331]:
eli5.show_weights(pipeline.named_steps["class"], vec=pipeline.named_steps["vect"], top=30)
Out[331]:
y=alternative metal top features y=country top features y=latin top features y=pop top features y=pop dance top features y=pop rap top features y=rap top features y=rock top features
Weight? Feature
+2.848 inside
+2.340 hate
+2.227 pain
+2.049 burn
+2.025 rise
+1.848 away
+1.768 fight
+1.735 save
+1.680 never
+1.494 feel
+1.476 breath
+1.471 fuck
+1.429 animal
+1.416 goodbye
+1.370 today
+1.340 breathe
+1.304 another
+1.292 world
+1.275 become
+1.271 fear
+1.229 within
+1.223 motherfucker
+1.218 riot
+1.217 sick
… 3193 more positive …
… 34462 more negative …
-1.551 said
-1.923 night
-2.171 girl
-2.234 nigga
-2.336 love
-2.648 baby
Weight? Feature
+3.935 whiskey
+3.140 beer
+2.803 little
+2.687 kiss
+2.618 country
+2.511 road
+2.291 truck
+2.132 town
+2.037 song
+2.005 good
+1.913 heart
+1.884 girl
+1.809 downtown
+1.806 drink
+1.804 cowboy
+1.754 drunk
+1.683 friday
+1.654 boot
+1.636 blue
+1.630 sitting
+1.579 buzz
+1.562 memory
+1.545 smile
+1.544 sure
… 3731 more positive …
… 33924 more negative …
-1.534 feel
-1.626 inside
-2.184 bitch
-2.440 shit
-2.845 fuck
-2.879 nigga
Weight? Feature
+2.133 havana
+1.672 released
+1.115 amor
+1.054 solitudine
+0.995 baby
+0.989 para
+0.941 estas
+0.933 pero
+0.930 fire
+0.912 manor
+0.911 loving
+0.885 extraordinary
+0.868 nicky
+0.861 know
+0.849 sway
+0.837 anything
+0.832 llora
+0.817 roll
+0.799 killed
+0.774 something
+0.772 piel
+0.757 wisin
+0.753 prefer
+0.747 lover
+0.744 mundo
+0.728 papi
+0.726 como
+0.719 shoulda
… 2231 more positive …
… 35424 more negative …
-0.755 nigga
-2.071 <BIAS>
Weight? Feature
+2.190 body
+1.995 <BIAS>
+1.612 hoping
+1.574 dolla
+1.526 running
+1.473 cool
+1.467 booty
+1.439 club
+1.346 giving
+1.289 piece
+1.285 jonas
… 14024 more positive …
… 23631 more negative …
-1.254 sound
-1.270 whiskey
-1.271 steal
-1.272 sitting
-1.284 blue
-1.290 round
-1.296 comin
-1.304 edge
-1.323 vibe
-1.333 town
-1.347 murder
-1.351 went
-1.378 daddy
-1.389 live
-1.511 across
-1.539 truck
-1.618 black
-1.778 road
-2.563 well
Weight? Feature
+2.110 cause
+1.805 chick
+1.759 till
+1.534 give
+1.471 night
+1.440 melody
+1.435 underneath
+1.368 worry
+1.335 enjoy
+1.310 darling
+1.301 reach
+1.291 shaggy
+1.229 call
+1.212 vibe
+1.208 special
+1.200 love
+1.190 nelly
+1.186 ghetto
+1.113 bass
+1.100 found
+1.095 take
+1.095 earthquake
… 5831 more positive …
… 31824 more negative …
-1.095 nothin
-1.115 though
-1.118 help
-1.247 done
-1.375 not_get
-1.465 mama
-1.502 break
-1.514 hate
Weight? Feature
+4.033 nigga
+3.562 shit
+2.696 bitch
+2.415 gravy
+2.289 fuck
+1.746 weed
+1.746 like
+1.673 plug
+1.643 shawty
+1.583 vibe
+1.578 pipe
+1.541 hoe
+1.533 though
+1.509 trap
+1.498 swear
+1.494 ridin
+1.475 shooter
+1.438 problem
+1.403 bottle
+1.391 skrrt
+1.383 cute
+1.375 homie
+1.351 famous
+1.345 homies
+1.343 bounce
+1.326 heard
+1.318 goin
+1.316 type
+1.301 hundred
… 12958 more positive …
… 24697 more negative …
-1.541 heart
Weight? Feature
+4.022 nigga
+2.951 bitch
+2.649 shit
+2.412 gang
+2.330 neck
+2.091 fuckin
+2.045 fuck
+1.992 brother
+1.728 sippin
+1.676 dawg
+1.647 okay
+1.581 rhyme
+1.562 purpp
+1.558 drug
+1.545 like
+1.539 motherfuckin
+1.533 biggie
+1.526 count
+1.523 bean
+1.500 kanye
+1.486 bust
+1.475 dogg
+1.461 fucked
+1.432 murder
+1.406 dick
+1.372 smoke
+1.372 rack
+1.357 diamond
… 11600 more positive …
… 26055 more negative …
-1.347 little
-1.512 tonight
Weight? Feature
+3.448 well
+2.284 monty
+2.155 child
+1.849 <BIAS>
+1.659 wind
+1.609 dead
+1.580 across
+1.444 time
+1.424 street
+1.409 believe
+1.380 wheel
… 9384 more positive …
… 28271 more negative …
-1.391 somethin
-1.401 girl
-1.407 swear
-1.420 gave
-1.469 hate
-1.481 club
-1.483 call
-1.545 real
-1.636 trust
-1.683 party
-1.746 every
-1.760 even
-1.894 fuck
-2.173 cause
-2.447 nigga
-2.511 not_wan
-3.110 bitch
-3.134 like
-3.164 shit

Just based on the logistic regression in its default configuration, several things already become appearant.

  • more than 50% of guesses are correct across all genres besides Latin, which point towards an overall decent performance of the classifier
  • latin is not guessed a single time, probably due to being very underrepresented in the dataset
  • pop being the majority class is guessed most frequently by a pretty big margin
  • hip hop and pop rap are confused which each other quite frequently, additionaly they have a considerable overlap with regards to their most influential features, such as nigga, shit, and bitch
  • while having a very low recall, the most influential features of both alternative metal and pop dance have an intuitive fit to the genre

Based on these observations, the following steps are taken to improve the model:

  • latin is dropped as it does not have sufficient support
  • hip hop and rap are consolidated to account for the appearant overlap between the two genres
  • use a more balanced measure than accuracy
In [332]:
# Drop Latin
df_lyrics = df_lyrics[df_lyrics["Genre"] != "latin"].reset_index()

# Drop Alternative Metal
df_lyrics = df_lyrics[df_lyrics["Genre"] != "alternative metal"].reset_index()

# Rename hip hop to rap
df_lyrics['Genre'] = df_lyrics['Genre'].replace(['pop rap'],'rap')
In [333]:
df_lyrics["Genre"].value_counts()
Out[333]:
pop          7129
rock         3277
rap          3084
pop dance    1207
country       899
Name: Genre, dtype: int64

Logistic Regression with consolidated genres

In [334]:
X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]

X_train,X_test,y_train,y_test = train_test_split(X, y, train_size = 0.75, stratify = y, random_state=RANDOM)
In [335]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('class', LogisticRegression(max_iter=1000,random_state=RANDOM)),
])

y_pred = pipeline.fit(X_train,y_train).predict(X_test)

plot_result_analysis(y_test,y_pred,pipeline.classes_)
              precision    recall  f1-score   support

     country       0.77      0.21      0.33       225
         pop       0.60      0.84      0.70      1782
   pop dance       0.50      0.01      0.02       302
         rap       0.78      0.68      0.73       771
        rock       0.58      0.47      0.52       819

    accuracy                           0.63      3899
   macro avg       0.65      0.44      0.46      3899
weighted avg       0.63      0.63      0.59      3899

In [336]:
eli5.show_weights(pipeline.named_steps["class"], vec=pipeline.named_steps["vect"], top=30)
Out[336]:
y=country top features y=pop top features y=pop dance top features y=rap top features y=rock top features
Weight? Feature
+3.638 whiskey
+3.033 beer
+2.589 country
+2.473 road
+2.452 town
+2.382 kiss
+2.315 little
+2.079 friday
+2.032 truck
+1.922 cowboy
+1.842 drunk
+1.824 drink
+1.758 song
+1.745 tennessee
+1.731 boot
+1.708 good
+1.697 blue
+1.686 heart
+1.662 hell
+1.590 memory
+1.578 sitting
+1.547 daddy
+1.546 moon
+1.497 every
+1.477 girl
… 3808 more positive …
… 33368 more negative …
-1.468 <BIAS>
-1.906 bitch
-2.189 shit
-2.464 fuck
-2.646 nigga
Weight? Feature
+1.882 giving
+1.768 body
+1.559 club
+1.519 bitch
+1.471 piece
+1.432 heartbeat
+1.401 booty
+1.395 dolla
+1.327 hoping
+1.309 khaled
+1.307 shit
+1.261 <BIAS>
… 14226 more positive …
… 22950 more negative …
-1.288 evil
-1.296 smoking
-1.331 round
-1.332 freedom
-1.332 house
-1.339 ball
-1.340 black
-1.355 truck
-1.381 long
-1.445 whiskey
-1.477 bend
-1.523 across
-1.528 vibe
-1.546 child
-1.622 sound
-1.847 town
-2.047 road
-2.905 well
Weight? Feature
+2.163 cause
+1.842 chick
+1.576 worry
+1.493 give
+1.489 shaggy
+1.434 ghetto
+1.431 underneath
+1.376 reach
+1.353 special
+1.321 bend
+1.299 till
+1.294 enjoy
+1.275 vibe
+1.253 take
+1.190 motherfucking
+1.117 show
+1.059 stop
+1.033 earthquake
+1.028 name
… 5876 more positive …
… 31300 more negative …
-1.022 used
-1.033 side
-1.039 break
-1.041 town
-1.064 miss
-1.076 drive
-1.086 not_get
-1.184 mama
-1.222 done
-1.231 well
-1.292 help
Weight? Feature
+4.017 nigga
+3.883 shit
+3.227 fuck
+2.962 bitch
+2.089 drug
+2.034 fuckin
+2.030 gang
+2.001 fucked
+1.967 hoe
+1.951 smoke
+1.879 black
+1.868 vibe
+1.849 skrrt
+1.801 gravy
+1.741 shawty
+1.709 like
+1.704 homies
+1.602 check
+1.576 lean
+1.530 text
+1.519 bust
+1.516 woah
+1.507 shorty
+1.503 kanye
+1.492 whip
+1.483 dawg
+1.419 sippin
+1.410 rhyme
… 17675 more positive …
… 19501 more negative …
-1.444 light
-1.660 heart
Weight? Feature
+3.332 well
+2.075 monty
+1.943 child
+1.707 dead
+1.591 strange
+1.503 across
+1.463 away
+1.434 fear
+1.405 knife
+1.323 death
… 9544 more positive …
… 27632 more negative …
-1.333 couple
-1.347 started
-1.361 club
-1.386 swear
-1.404 whatever
-1.413 keep
-1.444 girl
-1.458 every
-1.510 fuck
-1.534 real
-1.543 call
-1.618 even
-1.634 not_got
-1.901 party
-2.123 nigga
-2.195 shit
-2.313 cause
-2.438 not_wan
-2.898 bitch
-3.160 like
In [337]:
X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]

X_train,X_test,y_train,y_test = train_test_split(X, y, train_size = 0.75, stratify = y, random_state=RANDOM)
In [338]:
# Pipe with GridSearch LogReg (takes quite long to run)

X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]

# Split dataset into train and test data
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.75,stratify=y,random_state=RANDOM)

# Define paramters for grid search
parameters = {"vect__max_df": [0.1, 0.3, 1.0],
              "vect__max_features":[10000, 50000, None],
              "class__C":[0.1, 1, 5, 10]}

# Define Pipeline Steps
pipeline = Pipeline([('vect', TfidfVectorizer()),
                     ('class', LogisticRegression(max_iter=1000,
                                                  random_state=RANDOM))])

# Train the classifier through cross-validated grid search
clf = GridSearchCV(pipeline,
                   parameters,
                   cv=3,
                   scoring = "precision_macro",
                   n_jobs = -1,
                   verbose = 10)
clf.fit(X_train,y_train)

# Predict y-values using the classifier
y_pred = clf.predict(X_test)

# Calculate accuracy of the model
score = accuracy_score(y_test,y_pred)
print("Accuracy Score:",score)

# Create and print confusion matrix and classification report
cm = confusion_matrix(y_test,y_pred)
print(cm)
print(classification_report(y_test,y_pred,zero_division=False))
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   41.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   51.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  9.2min finished
Accuracy Score: 0.6142600666837651
[[  45  141    0    2   37]
 [   5 1511    1  102  163]
 [   1  221    4   38   38]
 [   1  235    0  521   14]
 [   4  488    1   12  314]]
              precision    recall  f1-score   support

     country       0.80      0.20      0.32       225
         pop       0.58      0.85      0.69      1782
   pop dance       0.67      0.01      0.03       302
         rap       0.77      0.68      0.72       771
        rock       0.55      0.38      0.45       819

    accuracy                           0.61      3899
   macro avg       0.68      0.42      0.44      3899
weighted avg       0.63      0.61      0.57      3899

In [339]:
clf.best_params_
Out[339]:
{'class__C': 1, 'vect__max_df': 0.1, 'vect__max_features': 50000}
In [340]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(max_df=0.1,max_features=50000)),
    ('class', LogisticRegression(max_iter=1000,random_state=RANDOM, C=1)),
])

pipeline.fit(X_train,y_train)

y_pred = pipeline.predict(X_test)

plot_result_analysis(y_test,y_pred,pipeline.classes_)

eli5.show_weights(pipeline.named_steps["class"], vec=pipeline.named_steps["vect"], top=30)
              precision    recall  f1-score   support

     country       0.80      0.20      0.32       225
         pop       0.58      0.85      0.69      1782
   pop dance       0.67      0.01      0.03       302
         rap       0.77      0.68      0.72       771
        rock       0.55      0.38      0.45       819

    accuracy                           0.61      3899
   macro avg       0.68      0.42      0.44      3899
weighted avg       0.63      0.61      0.57      3899

Out[340]:
y=country top features y=pop top features y=pop dance top features y=rap top features y=rock top features
Weight? Feature
+3.776 whiskey
+3.410 beer
+2.600 road
+2.596 town
+2.588 country
+2.494 kiss
+2.244 friday
+2.202 truck
+1.932 cowboy
+1.893 drunk
+1.876 song
+1.842 drink
+1.828 blue
+1.826 memory
+1.813 boot
+1.798 daddy
+1.774 tennessee
+1.754 sitting
+1.631 moon
+1.617 neon
+1.585 buzz
+1.562 hell
+1.533 downtown
+1.488 kind
+1.482 hair
+1.447 anything
+1.429 porch
+1.423 dirt
+1.395 easy
+1.394 lucky
… 3718 more positive …
… 33356 more negative …
Weight? Feature
+1.932 giving
+1.677 club
+1.519 heartbeat
+1.451 booty
+1.444 piece
+1.395 hoping
+1.371 khaled
+1.326 dolla
+1.317 cardi
+1.316 slowly
+1.297 <BIAS>
… 14240 more positive …
… 22834 more negative …
-1.297 hang
-1.304 cocaine
-1.318 evil
-1.332 black
-1.333 sand
-1.338 seat
-1.344 house
-1.358 vibe
-1.414 smoking
-1.425 went
-1.429 ball
-1.442 truck
-1.514 child
-1.546 sound
-1.571 whiskey
-1.602 bend
-1.608 across
-1.917 road
-2.099 town
Weight? Feature
+1.976 chick
+1.647 shaggy
+1.626 worry
+1.613 special
+1.596 till
+1.580 underneath
+1.567 reach
+1.522 ghetto
+1.368 bend
+1.312 vibe
+1.297 enjoy
+1.246 motherfucking
+1.186 letting
+1.109 ocean
+1.103 close
+1.097 language
+1.070 wham
+1.070 nelly
+1.065 earthquake
+1.063 sixteen
+1.057 steady
+1.055 echo
+1.051 least
… 5823 more positive …
… 31251 more negative …
-1.061 used
-1.068 miss
-1.068 not_tell
-1.106 not_get
-1.116 drive
-1.259 mama
-1.266 help
Weight? Feature
+2.955 fuckin
+2.887 hoe
+2.639 fucked
+2.523 gang
+2.212 smoke
+2.123 skrrt
+2.037 gravy
+2.012 drug
+1.999 shawty
+1.913 fucking
+1.912 dawg
+1.899 whip
+1.889 homies
+1.850 bust
+1.840 thug
+1.800 vibe
+1.781 black
+1.779 trap
+1.743 type
+1.720 check
+1.720 dick
+1.716 lean
+1.675 motherfucker
+1.674 ridin
+1.667 smoking
+1.662 text
+1.657 murder
+1.644 plug
+1.641 shorty
… 17877 more positive …
… 19197 more negative …
-1.575 kiss
Weight? Feature
+2.116 child
+1.936 monty
+1.863 strange
+1.718 across
+1.663 dead
+1.469 death
+1.432 knife
+1.421 fear
+1.349 bleed
+1.330 wind
+1.294 santana
… 9347 more positive …
… 27727 more negative …
-1.285 giving
-1.297 drug
-1.312 single
-1.357 probably
-1.378 front
-1.396 fuckin
-1.423 somethin
-1.429 pussy
-1.454 started
-1.463 whole
-1.517 gettin
-1.620 couple
-1.630 hoe
-1.721 club
-1.835 not_got
-1.870 swear
-1.892 whatever
-1.894 party
-2.378 not_wan

Logistic Regression with Oversampling

In [341]:
X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]


tfidf = TfidfVectorizer(max_features=50000)
X_vect = tfidf.fit_transform(X)


X_train,X_test,y_train,y_test = train_test_split(X_vect, y, train_size = 0.75, stratify = y, random_state=RANDOM)
In [342]:
y_train.value_counts()
Out[342]:
pop          5347
rock         2458
rap          2313
pop dance     905
country       674
Name: Genre, dtype: int64
In [343]:
# Instantiate random oversampler
oversample = RandomOverSampler(random_state=RANDOM)

# Oversample the train data
X_train_over, y_train_over = oversample.fit_sample(X_train, y_train)
In [344]:
y_train_over.value_counts()
Out[344]:
rock         5347
rap          5347
pop dance    5347
pop          5347
country      5347
Name: Genre, dtype: int64
In [345]:
# Define paramters for grid search
parameters = {"class__C":[0.1, 1, 5, 10]}

# Define Pipeline Steps
pipeline = Pipeline([('class', LogisticRegression(max_iter=1000, random_state=RANDOM))])

# Train the classifier through cross-validated grid search
clf = GridSearchCV(pipeline,
                   parameters,
                   cv=3,
                   scoring = "precision_macro",
                   n_jobs = -1,
                   verbose = 10)
clf.fit(X_train_over,y_train_over)

y_pred = clf.predict(X_test)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   50.2s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  1.4min remaining:   58.2s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  1.9min remaining:   37.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.4min finished
In [346]:
clf.best_params_
Out[346]:
{'class__C': 10}
In [347]:
pipeline = Pipeline([
    ('class', LogisticRegression(max_iter=1000,random_state=RANDOM, C=10)),
])

pipeline.fit(X_train_over,y_train_over)

y_pred = pipeline.predict(X_test)

plot_result_analysis(y_test,y_pred,pipeline.classes_)

eli5.show_weights(pipeline.named_steps["class"], vec=tfidf, top=30)
              precision    recall  f1-score   support

     country       0.49      0.52      0.50       225
         pop       0.64      0.60      0.62      1782
   pop dance       0.18      0.22      0.20       302
         rap       0.72      0.70      0.71       771
        rock       0.51      0.56      0.54       819

    accuracy                           0.58      3899
   macro avg       0.51      0.52      0.51      3899
weighted avg       0.59      0.58      0.58      3899

Out[347]:
y=country top features y=pop top features y=pop dance top features y=rap top features y=rock top features
Weight? Feature
+10.226 whiskey
+8.171 beer
+7.344 thoughtless
+7.035 friday
+6.860 truck
+6.346 tennessee
+6.312 quit
+6.153 boot
+6.073 buzz
+6.028 country
+6.027 colder
+6.023 sitting
+5.749 suspicion
+5.717 reaching
+5.696 stroll
+5.671 vote
+5.603 downtown
+5.580 learning
+5.573 kiss
+5.524 blonde
+5.418 drunk
+5.403 dial
+5.348 choosin
+5.340 walked
+5.308 land
+5.285 church
+5.279 worldwide
… 3701 more positive …
… 33475 more negative …
-7.557 nigga
-8.098 shit
-9.095 fuck
Weight? Feature
+5.758 slowly
+5.752 khaled
+5.054 giving
+5.035 dolla
+4.793 justin
+4.485 booty
+4.452 honestly
+4.439 heartbeat
+4.389 glitter
+4.386 charli
+4.332 sunlight
+4.293 club
+4.211 cardi
… 14971 more positive …
… 22205 more negative …
-4.151 sound
-4.166 walked
-4.187 aight
-4.192 freedom
-4.206 commit
-4.262 shoe
-4.299 sacred
-4.317 well
-4.360 across
-4.392 evil
-4.457 respect
-4.766 load
-4.896 pushing
-4.971 speaker
-4.988 truck
-5.137 bend
-5.448 cocaine
Weight? Feature
+7.407 shaggy
+6.262 luda
+5.952 underneath
+5.948 commit
+5.715 flag
+5.686 unwinds
+5.677 advert
+5.660 neighbour
+5.627 seventy
+5.611 oooooh
+5.599 twiddly
+5.472 not_beat
+5.422 vida
+5.289 tangible
+5.285 dealing
+5.256 casualty
+5.187 armed
+5.183 downed
+5.171 caring
+5.091 swingin
+5.031 chick
+5.028 placed
+4.980 language
+4.955 piled
… 5894 more positive …
… 31282 more negative …
-5.028 fighting
-5.155 perfect
-5.331 luck
-5.359 not_forget
-5.423 drive
-6.407 help
Weight? Feature
+6.860 shit
+6.273 thug
+5.883 shorty
+5.765 fuck
+5.126 fucked
+5.109 smoke
+5.033 gravy
+4.898 selfish
+4.890 pride
+4.878 candy
+4.844 hennessy
+4.665 kanye
+4.569 hoe
+4.560 nigga
+4.555 homies
+4.507 montana
+4.481 grind
+4.477 scared
+4.470 vibe
+4.451 prolly
+4.418 trust
+4.407 bein
+4.383 gush
+4.360 dolo
+4.338 bitch
+4.296 fuckin
+4.264 lately
+4.252 le
… 16937 more positive …
… 20239 more negative …
-4.518 kind
-5.797 monty
Weight? Feature
+9.620 monty
+5.736 well
+5.499 tend
+5.434 not_belong
+5.376 santana
+5.261 strange
+5.045 psychotic
+4.922 twisted
+4.743 not_complain
+4.731 bleed
+4.601 engine
+4.560 inch
+4.531 aight
+4.467 appears
+4.461 swallowing
+4.442 moonage
+4.411 noise
+4.390 across
… 9324 more positive …
… 27852 more negative …
-4.388 plane
-4.423 not_wan
-4.511 admit
-4.622 whatever
-4.623 unwinds
-4.694 tough
-4.732 not_lose
-4.863 travel
-4.921 started
-5.026 chest
-5.095 hurting
-5.883 couple

Topic Modelling

In [348]:
word_dict = Dictionary(df_lyrics["Lyrics Clean Tok"])
print(len(word_dict))
42079
In [349]:
word_count = Counter([word for words in df_lyrics["Lyrics Clean Tok"] for word in words])

word_count_values = list(word_count.values())

print(f"Total amount of unique words: {len(word_count)}")
print(f"Total amount of unique words that appear more than 5000 times: {len([i for i in word_count_values if i > 5000])}")
print(f"Total amount of unique words that appear more than 2000 times: {len([i for i in word_count_values if i > 2000])}")
print(f"Total amount of unique words that appear more than 1000 times: {len([i for i in word_count_values if i > 1000])}")
print(f"Total amount of unique words that appear more than 10 times: {len([i for i in word_count_values if i > 10])}")
print(f"Total amount of unique words that appear less than 5 times: {len([i for i in word_count_values if i < 5])}")
print(f"Total amount of unique words that appear 1 time: {len([i for i in word_count_values if i <= 2])}")
perc_25 = np.quantile(word_count_values,0.25)
perc_50 = np.quantile(word_count_values,0.5)
perc_75 = np.quantile(word_count_values,0.75)
perc_99 = np.quantile(word_count_values,0.99)
perc_995 = np.quantile(word_count_values,0.995)
perc_999 = np.quantile(word_count_values,0.999)
print(f"\n\n25% Percentile at {perc_25}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_25])}\n\n")
print(f"50% Percentile at {perc_50}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_50])}\n\n")
print(f"75% Percentile at {perc_75}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_75])}\n\n")
print(f"99% Percentile at {perc_99}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_99])}\n\n")
print(f"99.5% Percentile at {perc_995}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_995])}\n\n")
print(f"99.9% Percentile at {perc_999}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_999])}")
Total amount of unique words: 42079
Total amount of unique words that appear more than 5000 times: 58
Total amount of unique words that appear more than 2000 times: 203
Total amount of unique words that appear more than 1000 times: 407
Total amount of unique words that appear more than 10 times: 10081
Total amount of unique words that appear less than 5 times: 26136
Total amount of unique words that appear 1 time: 20965


25% Percentile at 1.0.
Total amount of unique words that appear more than this percentile: 26682


50% Percentile at 3.0.
Total amount of unique words that appear more than this percentile: 17999


75% Percentile at 10.0.
Total amount of unique words that appear more than this percentile: 10081


99% Percentile at 976.4400000000023.
Total amount of unique words that appear more than this percentile: 421


99.5% Percentile at 1914.6600000000035.
Total amount of unique words that appear more than this percentile: 211


99.9% Percentile at 5827.219999999987.
Total amount of unique words that appear more than this percentile: 43
In [350]:
too_frequent_words = [word for word, count in word_count.items() if count > perc_999]

too_unfrequent_words = [word for word, count in word_count.items() if count < 10]
manual_remove = ("something someone everything everybody nothing somebody really nobody okay woah whoa another cause take never "
                 "back want ever forever last first well without enough world life better alright look give live think would still hand "
                 "nigga bitch fuck fuckin good shit little keep").split(" ")
In [351]:
too_frequent_words_ids = [word_dict.token2id[word] for word in too_frequent_words]
too_unfrequent_words_ids = [word_dict.token2id[word] for word in too_unfrequent_words]
manual_remove_ids = [word_dict.token2id[word] for word in manual_remove]
In [352]:
words_to_delete = manual_remove_ids + too_unfrequent_words_ids + too_frequent_words_ids
In [353]:
word_dict.filter_tokens(bad_ids=words_to_delete)
print(len(word_dict))
10625
In [354]:
corpus_track = [word_dict.doc2bow(doc) for doc in df_lyrics["Lyrics Clean Tok"]]
In [355]:
# Create dictionary for perplexity scores
perplexity_scores = dict()

# Create dictionary for coherence scores
coherence_scores = dict()

# Iterate over 2 to 15 topics (takes quite long to execute)
for i in tqdm(range(2,16)):
    
    # Create an LDA Model
    lda_model = LdaModel(corpus_track,
                         id2word=word_dict,
                         num_topics=i,
                         random_state=RANDOM,
                         passes = 3,
                         alpha="auto")
    
    # Calculate perplexity score and append it to dict
    perplexity = lda_model.log_perplexity(corpus_track)
    perplexity_scores[i] = perplexity
    
    # Calculate coherence score and append it to dict
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=df_lyrics["Lyrics Clean Tok"],
                                         dictionary=word_dict,
                                         coherence='c_v')
    
    coherence_scores[i] = coherence_model_lda.get_coherence()
    
100%|██████████| 14/14 [18:56<00:00, 81.21s/it]
In [356]:
# Plot perplexity scores
plt.plot(perplexity_scores.keys(), perplexity_scores.values())
plt.xlabel("Number of Topics")
plt.ylabel("Perplexity")
plt.title("Perplexity Scores")
plt.show()
In [357]:
# Plot Coherence scores
plt.plot(coherence_scores.keys(), coherence_scores.values())
plt.xlabel("Number of Topics")
plt.ylabel("Coherence")
plt.title("Coherence Scores")
plt.show()
Try LDA Model with 3 topics
In [358]:
lda_model_track_3 = LdaModel(corpus_track,
                             id2word=word_dict,
                             num_topics=3,
                             random_state=RANDOM,
                             passes = 3,
                             alpha="auto")
lda_model_track_3.print_topics(-1)
Out[358]:
[(0,
  '0.006*"eye" + 0.006*"always" + 0.006*"home" + 0.006*"light" + 0.005*"hold" + 0.005*"not_you" + 0.005*"feeling" + 0.005*"not_know" + 0.005*"long" + 0.005*"find"'),
 (1,
  '0.006*"real" + 0.006*"dumb" + 0.005*"young" + 0.004*"party" + 0.004*"check" + 0.004*"hoe" + 0.004*"ride" + 0.004*"pull" + 0.004*"name" + 0.004*"black"'),
 (2,
  '0.007*"talk" + 0.006*"crazy" + 0.005*"friend" + 0.005*"beat" + 0.005*"real" + 0.005*"bring" + 0.005*"show" + 0.005*"bang" + 0.005*"ready" + 0.005*"walk"')]
In [359]:
lda_display_3 = pyLDAvis.gensim.prepare(lda_model_track_3, corpus_track, word_dict)
pyLDAvis.display(lda_display_3)
Out[359]:
Try LDA Model with 6 topics
In [360]:
lda_model_track_6 = LdaModel(corpus_track,
                             id2word=word_dict,
                             num_topics=6,
                             random_state=RANDOM,
                             passes = 3,
                             alpha="auto")
lda_model_track_6.print_topics(-1)
Out[360]:
[(0,
  '0.008*"home" + 0.008*"always" + 0.007*"eye" + 0.007*"light" + 0.007*"hold" + 0.007*"feeling" + 0.006*"long" + 0.006*"find" + 0.006*"not_know" + 0.006*"tonight"'),
 (1,
  '0.011*"fire" + 0.010*"high" + 0.010*"party" + 0.009*"black" + 0.009*"young" + 0.008*"rain" + 0.007*"dance" + 0.007*"move" + 0.006*"body" + 0.006*"town"'),
 (2,
  '0.019*"friend" + 0.017*"not_you" + 0.016*"talk" + 0.013*"walk" + 0.012*"mine" + 0.010*"wish" + 0.010*"not_need" + 0.010*"babe" + 0.010*"pretty" + 0.009*"bang"'),
 (3,
  '0.007*"pull" + 0.007*"hoe" + 0.006*"ride" + 0.006*"work" + 0.005*"pussy" + 0.005*"diamond" + 0.004*"rich" + 0.004*"throw" + 0.004*"club" + 0.004*"body"'),
 (4,
  '0.013*"real" + 0.012*"dumb" + 0.010*"crazy" + 0.006*"goin" + 0.006*"bring" + 0.005*"side" + 0.005*"game" + 0.005*"gang" + 0.004*"problem" + 0.004*"feelin"'),
 (5,
  '0.009*"check" + 0.008*"hell" + 0.008*"beat" + 0.007*"hate" + 0.007*"fucking" + 0.006*"name" + 0.006*"turn" + 0.006*"please" + 0.006*"made" + 0.005*"minute"')]
In [361]:
lda_display_6 = pyLDAvis.gensim.prepare(lda_model_track_6, corpus_track, word_dict)
pyLDAvis.display(lda_display_6)
Out[361]:
Try LDA Model with 13 topics
In [362]:
lda_model_track_13 = LdaModel(corpus_track,
                              id2word=word_dict,
                              num_topics=13,
                              random_state=RANDOM,
                              passes = 3,
                              alpha="auto")
lda_model_track_13.print_topics(-1)
Out[362]:
[(0,
  '0.015*"tonight" + 0.014*"light" + 0.013*"home" + 0.012*"eye" + 0.010*"dream" + 0.010*"long" + 0.010*"always" + 0.009*"feeling" + 0.009*"change" + 0.009*"lost"'),
 (1,
  '0.027*"high" + 0.026*"young" + 0.024*"party" + 0.023*"rain" + 0.018*"move" + 0.014*"heaven" + 0.012*"alive" + 0.011*"roll" + 0.009*"town" + 0.009*"hello"'),
 (2,
  '0.044*"friend" + 0.031*"talk" + 0.030*"walk" + 0.021*"not_need" + 0.016*"falling" + 0.015*"anymore" + 0.014*"round" + 0.014*"not_care" + 0.014*"believe" + 0.014*"not_have"'),
 (3,
  '0.011*"name" + 0.009*"club" + 0.008*"shawty" + 0.007*"drink" + 0.007*"shot" + 0.007*"pull" + 0.007*"diamond" + 0.007*"cash" + 0.006*"freak" + 0.006*"bottle"'),
 (4,
  '0.023*"real" + 0.023*"crazy" + 0.020*"dumb" + 0.015*"stay" + 0.011*"not_know" + 0.010*"goin" + 0.010*"much" + 0.009*"bring" + 0.008*"problem" + 0.007*"feelin"'),
 (5,
  '0.026*"not_want" + 0.026*"hell" + 0.025*"check" + 0.014*"wake" + 0.011*"minute" + 0.009*"trust" + 0.009*"listen" + 0.009*"girlfriend" + 0.009*"nothin" + 0.007*"bass"'),
 (6,
  '0.049*"not_you" + 0.027*"head" + 0.026*"show" + 0.023*"turn" + 0.019*"not_let" + 0.017*"find" + 0.014*"not_stop" + 0.014*"not_get" + 0.013*"please" + 0.013*"free"'),
 (7,
  '0.055*"going" + 0.052*"work" + 0.042*"mine" + 0.035*"side" + 0.028*"lonely" + 0.025*"maybe" + 0.019*"summer" + 0.014*"follow" + 0.013*"line" + 0.012*"not_love"'),
 (8,
  '0.034*"body" + 0.014*"sweet" + 0.013*"kiss" + 0.013*"loving" + 0.012*"game" + 0.011*"mama" + 0.009*"babe" + 0.009*"ready" + 0.008*"honey" + 0.008*"play"'),
 (9,
  '0.012*"hoe" + 0.011*"gang" + 0.009*"rich" + 0.008*"pussy" + 0.007*"dope" + 0.007*"whole" + 0.007*"dick" + 0.006*"fucking" + 0.006*"pull" + 0.006*"shoot"'),
 (10,
  '0.014*"hold" + 0.009*"gone" + 0.009*"even" + 0.009*"not_wan" + 0.008*"much" + 0.008*"thought" + 0.007*"hurt" + 0.007*"break" + 0.007*"leave" + 0.007*"coming"'),
 (11,
  '0.033*"dance" + 0.023*"fire" + 0.016*"black" + 0.012*"burn" + 0.011*"higher" + 0.011*"wild" + 0.010*"blue" + 0.010*"rock" + 0.010*"dead" + 0.009*"deep"'),
 (12,
  '0.025*"ride" + 0.017*"bang" + 0.015*"throw" + 0.013*"shake" + 0.011*"black" + 0.009*"street" + 0.009*"house" + 0.008*"hood" + 0.008*"watch" + 0.007*"bounce"')]
In [363]:
lda_display_13 = pyLDAvis.gensim.prepare(lda_model_track_13, corpus_track, word_dict)
pyLDAvis.display(lda_display_13)
Out[363]:

Based on the visulaisations of the different topic configurations, the LDA model with 6 topics was determined to be the most suitable one. For that reason, each lyric is now assigned the topic that it belongs to the most.

In [371]:
def assign_topic(values):
    """
    Function takes list of topic,value pairings as input and returns the topic with the highest value, given that
    this value is larger than 0.25
    """
    
    max_value = 0.25
    max_topic = None
    
    for topic, value in values:
        if value > max_value:
            max_value = value
            max_topic = topic
        else:
            continue
    return max_topic
In [372]:
# Iterate over all tracks from corpus and create list with the assigned topic of each track
topics = []
for i in range(len(corpus_track)):
    topics.append(assign_topic(lda_model_track_6[corpus_track][i]))
In [373]:
# Add topic column to the dataframe
df_lyrics["Topic"] = topics
In [374]:
# Count number of unclassified topics
df_lyrics["Topic"].isna().sum()
Out[374]:
25
In [375]:
df_lyrics["Topic"].value_counts()
Out[375]:
0.0    8700
3.0    2329
1.0    1464
4.0    1413
2.0     834
5.0     831
Name: Topic, dtype: int64
In [376]:
# Assign labels to the topics
df_lyrics["Topic"] = df_lyrics["Topic"].map({0: "Feelings",
                                             1: "Party",
                                             2: "Relationship",
                                             3: "Gangstar",
                                             4: "Rebel",
                                             5: "Rage"})
In [377]:
pd.crosstab(df_lyrics["Genre"], df_lyrics["Topic"])
Out[377]:
Topic Feelings Gangstar Party Rage Rebel Relationship
Genre
country 568 12 215 26 43 34
pop 4648 606 510 380 522 456
pop dance 697 144 128 54 94 88
rap 527 1477 122 208 673 68
rock 2260 90 489 163 81 188
In [378]:
pd.crosstab(df_lyrics["Genre"], df_lyrics["Topic"]).apply(lambda r: round(r/r.sum(),2), axis=1)
Out[378]:
Topic Feelings Gangstar Party Rage Rebel Relationship
Genre
country 0.63 0.01 0.24 0.03 0.05 0.04
pop 0.65 0.09 0.07 0.05 0.07 0.06
pop dance 0.58 0.12 0.11 0.04 0.08 0.07
rap 0.17 0.48 0.04 0.07 0.22 0.02
rock 0.69 0.03 0.15 0.05 0.02 0.06
In [379]:
pd.crosstab(df_lyrics["Genre"], df_lyrics["Topic"]).apply(lambda r: round(r/r.sum(),2), axis=0)
Out[379]:
Topic Feelings Gangstar Party Rage Rebel Relationship
Genre
country 0.07 0.01 0.15 0.03 0.03 0.04
pop 0.53 0.26 0.35 0.46 0.37 0.55
pop dance 0.08 0.06 0.09 0.06 0.07 0.11
rap 0.06 0.63 0.08 0.25 0.48 0.08
rock 0.26 0.04 0.33 0.20 0.06 0.23

Similar Song Finder

In [380]:
def get_top_n_indices(iterable,n):
    
    """
    Returns indices of highest values in descending order.
    
        Parameters:
            iterable (array): Cosine similarity values of the respective song
            n (integer): Indicates how many indices should be returned
            
        Returns:
            list of indices of top n similarity values
    """
    
    # Get top n indices from input row
    top_n_values = np.partition(iterable,-n)[-n:]
    top_n_indices = np.argpartition(iterable,-n)[-n:]
    
    # Put indices in descending order (hence the negative iterable) and return the result
    top_n_values_sorted = sorted(top_n_values, reverse=True)
    top_n_indices_sorted = top_n_indices[np.argsort(-iterable[top_n_indices])]
    return top_n_values_sorted, top_n_indices_sorted
In [381]:
def find_most_similar_text(title, top_n):
    """
    Prints out table of top n songs that are most similar to input song by retrieving values from a song similarity matrix created
    outside this function.
    
        Parameters:
            title (string): Title of the track for which similar tracks should be found
            top_n (int): Number of similar songs that should be displayed
    """
    
    tab = PrettyTable(["Position","Index","Title","Artist","Score","Genre","Topic"])
    
    # Check if input is string
    if type(title) is str:
        
        # Find index for song title (if title appears several times, first appearance is chosen)
        index = df_lyrics.index[df_lyrics["Title"] == title][0]
        #print(index)
        
    else:
        index = title
    
    # Find dataframe row with index from input
    query_data = df_lyrics.iloc[index,:]
    
    # Extract Artist and Title from dataframe
    query_artist = query_data["Artist"]
    query_title = query_data["Title"]
    query_genre = query_data["Genre"]
    query_topic = query_data["Topic"]
    
    # Find lyrics with highest similarity
    top_values, top_indices = get_top_n_indices(song_similarity[index,:],top_n)
    
    # Get information about similar songs and append them to table
    for ctr, (val, ind) in enumerate(zip(top_values,top_indices)):
        artist = df_lyrics.iloc[ind,:]["Artist"]
        title = df_lyrics.iloc[ind,:]["Title"]
        genre = df_lyrics.iloc[ind,:]["Genre"]
        topic = df_lyrics.iloc[ind,:]["Topic"]
        
        tab.add_row([ctr+1,ind,title,artist,round(val,3), genre, topic])
    
    # Display similarity table
    print(f'Most similar lyrics to "{query_title}" by {query_artist} (Index = {index}, Genre = {query_genre}, Topic = {query_topic}).\n')
    print(tab)
In [382]:
def compare_lyrics(*ids):
    """
    Shows the lyrics for an arbitrary selection of songs.
    
        Parameters:
            *ids (comma-separated ints): Indices of the songs that should be compared
    """
    
    # Iterate over input ids
    for ID in ids:
        
        # Retrieve row from dataframe and corresponding column values
        df = df_lyrics.iloc[ID,:]
        artist = df["Artist"]
        title = df["Title"]
        lyrics = df["Lyrics"]
        
        # Print song lyrics
        print(f'"{title}" by {artist} (ID = {ID})\n\n{lyrics}\n\n\n')
In [383]:
# Create tfidf representation of lyrics
vectorizer = TfidfVectorizer()
track_lyrics_tfidf = vectorizer.fit_transform(df_lyrics["Lyrics Clean No Tok"])
In [384]:
track_lyrics_tfidf.shape
Out[384]:
(15596, 42079)
In [385]:
# Create song similarity matrix basd on cosine similarity of lyrics
song_similarity = cosine_similarity(track_lyrics_tfidf)

# Set identical song's similarity to zero
np.fill_diagonal(song_similarity,0)
In [386]:
song_similarity.shape
Out[386]:
(15596, 15596)
In [387]:
find_most_similar_text("Wish You Were Here",10)
Most similar lyrics to "Wish You Were Here" by Pink Floyd (Index = 2512, Genre = rock, Topic = Feelings).

+----------+-------+----------------------+--------------+-------+---------+--------------+
| Position | Index |        Title         |    Artist    | Score |  Genre  |    Topic     |
+----------+-------+----------------------+--------------+-------+---------+--------------+
|    1     | 13783 |      Bet U Wish      |     RAYE     | 0.255 |   pop   | Relationship |
|    2     |  3229 |   Let You Love Me    |   Rita Ora   | 0.249 |   pop   | Relationship |
|    3     | 12336 |    Wish You Would    | Marian Hill  | 0.234 |   pop   | Relationship |
|    4     | 10800 | I Wish You Were Beer | Dustin Lynch | 0.218 | country | Relationship |
|    5     | 12778 |  Wish You Were Here  |   Incubus    | 0.215 |   rock  |   Feelings   |
|    6     |  8988 | Wish You Were Sober  |  Conan Gray  |  0.21 |   pop   | Relationship |
|    7     | 12045 |     David Watts      |  The Kinks   | 0.205 |   rock  | Relationship |
|    8     |  5926 |    Tears and Rain    | James Blunt  | 0.199 |   pop   |   Feelings   |
|    9     |  7499 |     I Wish I Was     | Maren Morris | 0.198 | country |   Feelings   |
|    10    |  7798 |      Airplanes       |    B.o.B     | 0.187 |   pop   | Relationship |
+----------+-------+----------------------+--------------+-------+---------+--------------+
In [388]:
compare_lyrics(3157,3534)
"U Remind Me" by Usher (ID = 3157)

. Yo, I ain't seen you in a minute. But I got something to tell you. Listen. . . See, the thing about you that caught my eye. Is the same thing that makes me change my mind. Kind of hard to explain, but girl, I'll try. You need to sit down, this may take a while. See this girl, she sort of looks just like you. She even smiles just the way you do. So innocent she seemed, but I was fooled. I'm reminded when I look at you. . . You remind me of a girl, that I once knew. See her face whenever I, I look at you. You won't believe all of the things she put me through. This is why I just can't get with you. . . Thought that she was the one for me. Til I found out she was on her creep. Ooh, she was sexing everyone, but me. This is why we could never be. . . You remind me of a girl, that I once knew. See her face whenever I, I look at you. You won't believe all of the things she put me through. This is why I just can't get with you. . . I know it's so unfair to you (it's so unfair). That I relate her ignorance to you. Wish I knew. Wish I knew how to separate the two. You remind me (you remind me). . . Whoa whoa oh oh oh oh. You remind me of a girl that I once knew. See her face whenever I, I look at you. You won't believe all of the things she put me through. This is why I just can't get with you, (gotta let you go). You remind me of a girl that I once knew. See her face whenever I, I look at you. You won't believe all of the things she put me through (all the shit she put me through). This is why I just can't get with you, no no. You remind me of a girl that I once knew. See her face whenever I, I look at you. You won't believe all of the things she put me through. This is why I just can't get with you. You remind me of a girl that I once knew. See her face whenever I, I look at you. You won't believe all of the things she put me through. This is why I just can't get with you. You remind me of a girl that I once knew. See her face whenever I, I look at you. You won't believe all of the things she put me through. This is why I just can't get with you. . . You remind me of a girl that I once knew. See her face whenever I, I look at you. Wouldn't believe all of the things she put me through. This is why I just can't get with you



"Big Bad Wolf" by Fifth Harmony (ID = 3534)

. Kiss for a kiss, my two lips got you racing. Baby we're guilty, wherever we are. Angel confess, let me be your salvation. I'll carry you through the dark. . . Let me give, let me give you something to believe in. You don't gotta, you don't gotta even have a reason. (You don’t even gotta have, gotta have a reason). You can love me, you can love me baby all weekend. Come on, come on over here and sink your teeth in. . . Cause if you wanna talk baby use your hands. If you wanna go make a move, you can. If you want a shot, baby cock and pull. If you're gonna bite, be a big bad wolf. If you wanna talk baby use your hands. If you wanna go make a move, you can. If you want a shot, baby cock and pull. If you're gonna bite, be a big bad wolf. . . Whisper your secrets and mine will take over. Swing like your chariot and let me ride. Drunk off my body, you'll never be sober. I'll be your poison tonight. . . Let me give, let me give you something to believe in. You don't gotta, you don't gotta even have a reason. (You don’t even gotta have, gotta have a reason). You can love me, you can love me baby all weekend. Come on, come on over here and sink your teeth in. . . Cause if you wanna talk baby use your hands. If you wanna go make a move, you can. If you want a shot, baby cock and pull. If you're gonna bite, be a big bad wolf. Cause if you wanna talk baby use your hands. If you wanna go make a move, you can. If you want a shot, baby cock and pull. If you're gonna bite, be a big bad wolf. . . Don't hold back back baby. Give me that baby. Come do anything you want. La-la-la-la. Don't hold back baby. Give me that baby. I'll do anything you want. . . Cause if you wanna talk baby use your hands. If you wanna go make a move, you can. If you want a shot, baby cock and pull. If you're gonna bite, be a big bad wolf. Cause if you wanna talk baby use your hands. If you wanna go make a move, you can. If you want a shot, baby cock and pull. If you're gonna bite, be a big bad wolf



Similar Artist Finder

In [389]:
def find_most_similar_artist(artist, top_n):
    
    tab = PrettyTable(["Position","Index","Artist","Score","Genre"])
    
    # Get artist index
    artist_index = artist_lyrics_complete.index[artist_lyrics_complete["Artist"] == artist][0]
    artist_genre = artist_lyrics_complete[artist_lyrics_complete["Artist"] == artist]["Genre"].values[0]
    
    # Find artist with highest similarity
    top_values, top_indices = get_top_n_indices(artist_similarity_matrix[artist_index,:],top_n)
    
    # Get information about similar artists and append them to table
    for ctr, (val, ind) in enumerate(zip(top_values,top_indices)):
        artist_comp = artist_lyrics_complete.iloc[ind,:]["Artist"]
        genre_comp = artist_lyrics_complete.iloc[ind,:]["Genre"]
        
        tab.add_row([ctr+1,ind,artist_comp,round(val,3),genre_comp])
    
    # Display similarity table
    print(f'Most similar artist to "{artist}" (Index = {artist_index}, Genre = {artist_genre}).\n')
    print(tab)
In [390]:
# Create dataframe that contains artists and all their available lyrics
artist_lyrics_complete = df_lyrics.groupby(["Artist"],sort=False)["Lyrics Clean No Tok"].apply(lambda x: " ".join(x)).reset_index()
artist_lyrics_complete["Lyrics Clean Tok"] = df_lyrics.groupby(["Artist"],sort=False)["Lyrics Clean Tok"].apply(lambda x: list(x)).reset_index()["Lyrics Clean Tok"]
artist_lyrics_complete["Lyrics Clean Tok"] = artist_lyrics_complete["Lyrics Clean Tok"].apply(lambda x: [word for l in x for word in l])
artist_lyrics_complete = artist_lyrics_complete.merge(df_lyrics[["Artist","Genre"]], left_on="Artist",right_on="Artist").drop_duplicates("Artist").reset_index(drop=True)
artist_lyrics_complete.tail()
Out[390]:
Artist Lyrics Clean No Tok Lyrics Clean Tok Genre
657 Tech N9ne kalk minik mikrofonunu getir burada diktafonun... [kalk, minik, mikrofonunu, getir, burada, dikt... rap
658 The National gold light break behind house not_see strange ... [gold, light, break, behind, house, not_see, s... rock
659 Old Dominion turn turn turn back weekend word want flirt dr... [turn, turn, turn, back, weekend, word, want, ... country
660 Gigi D'Agostino mind head came dream love share waiting mind h... [mind, head, came, dream, love, share, waiting... pop
661 Don Toliver know know know drunk tell want club know nasty... [know, know, know, drunk, tell, want, club, kn... rap
In [391]:
# Create tfidf representation of artist lyrics
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=30000)
artist_lyrics_tfidf = vectorizer.fit_transform(artist_lyrics_complete["Lyrics Clean No Tok"])
In [392]:
artist_lyrics_tfidf.shape
Out[392]:
(662, 30000)
In [393]:
artist_similarity_matrix = cosine_similarity(artist_lyrics_tfidf)
np.fill_diagonal(artist_similarity_matrix,0)
artist_similarity_matrix.shape
Out[393]:
(662, 662)
In [394]:
find_most_similar_artist("Blake Shelton",10)
Most similar artist to "Blake Shelton" (Index = 271, Genre = country).

+----------+-------+------------------+-------+---------+
| Position | Index |      Artist      | Score |  Genre  |
+----------+-------+------------------+-------+---------+
|    1     |  467  |  Cole Swindell   | 0.219 | country |
|    2     |  549  | Billy Currington | 0.213 | country |
|    3     |  236  |   Jason Aldean   | 0.213 | country |
|    4     |  552  |   Chris Young    | 0.212 | country |
|    5     |  380  | Brantley Gilbert | 0.207 | country |
|    6     |  189  |    Luke Combs    | 0.206 | country |
|    7     |  312  |   Eric Church    | 0.202 | country |
|    8     |  529  |  Rascal Flatts   | 0.198 | country |
|    9     |  453  |   Dustin Lynch   | 0.198 | country |
|    10    |  565  |   Brad Paisley   | 0.197 | country |
+----------+-------+------------------+-------+---------+